diff --git a/.github/workflows/Dockerfile b/.github/workflows/Dockerfile new file mode 100644 index 00000000000000..9a8ae38871c2d9 --- /dev/null +++ b/.github/workflows/Dockerfile @@ -0,0 +1,51 @@ +FROM alpine:3.20.3 + +ARG MODE= +ADD vmlinux /root/. +ADD inittab /etc/inittab +RUN mkdir /etc/init.d +ADD rcS /etc/init.d/. +RUN sed "s/\'/\"/g" /etc/profile > /.profile + + +ADD bench.sh / +ADD iperf3.sh /root/ +ADD netperf-bench.sh /root/ +ADD iperf3.static /root/ +ADD do_getpid /root/. +ADD clone /root/. +ADD futex /root/. +ADD nullptr /root/. +ADD ctest /root/. +ADD stackp /root/. +ADD init_array /root/. +ADD entropy /root/. +ADD locale /root/. +ADD vdso_test /root/. +ADD pthread /root/. +ADD test-signal-restore /root/. +ADD Makefile /root/. +ADD lmbench2/ /lmbench2 +ADD lmbench_run.sh /lmbench2/bin/x86_64-linux-gnulibc1/ +ADD netperf /root/. + + +RUN apk update && apk add utmps-libs libtirpc curl make iperf3 + +RUN if [[ ${MODE} == "um-nommu" ]]; then mkdir -p setup && cd setup && curl -L -o output.zip \ + 'https://gitlab.alpinelinux.org/thehajime/aports/-/jobs/2226372/artifacts/download?file_type=archive' \ + && unzip output.zip ; fi +RUN if [[ ${MODE} == "um-nommu" ]]; then apk add --allow-untrusted \ + setup/packages/main/x86_64/busybox-nommu-1.36.1-r29.apk \ + setup/packages/main/x86_64/musl-nommu-1.2.5-r0.apk && rm -rf setup; fi + +# for ifupdown +RUN mkdir -p /etc/network +ADD interfaces /etc/network/. + +# XXX +RUN mkdir -p /usr/share/udhcpc +ADD default.script /usr/share/udhcpc/. 
+RUN chmod +x /usr/share/udhcpc/default.script
+
+ENTRYPOINT ["/root/vmlinux", "vec0:transport=raw,ifname=eth0,depth=128,gro=1", "root=/dev/root", "rootflags=/", "rootfstype=hostfs", "rw", "mem=1024m", "loglevel=0", "init=/sbin/init"]
diff --git a/.github/workflows/Makefile b/.github/workflows/Makefile
new file mode 100644
index 00000000000000..c4305989198bbb
--- /dev/null
+++ b/.github/workflows/Makefile
@@ -0,0 +1,38 @@
+SRCS := clone.c futex.c nullptr.c ctest.c stackp.c do_getpid.c init_array.c entropy.c locale.c vdso_test.c pthread.c test-signal-restore.c
+BKUP := exit.c noop.c
+TARGETS := $(SRCS:.c=)
+
+MASTER_CC ?= docker run -it --rm -v /dev/shm:/dev/shm -v /home:/home -v /tmp:/tmp --user tazaki -w `pwd` alpine-local:3.20.3 cc
+CC=${MASTER_CC}
+
+CFLAGS_exit = -nostdlib -pie -fPIE -fomit-frame-pointer
+CFLAGS += -g -O0 -static-pie
+all: ${TARGETS}
+
+tests: ${TARGETS}
+	set +x
+	./clone host
+	./futex
+# Each nullptr variant below must terminate with one of the listed statuses
+# (139 = 128 + SIGSEGV, 136 = 128 + SIGFPE). RET starts at 0 so that a run which
+# unexpectedly exits cleanly fails the check instead of passing with RET unset.
+	RET=0 ; ./nullptr 1 0 || RET=$$? ; echo "code="$$RET && if [ "$$RET" -ne "139" ] ; then false ; fi
+	RET=0 ; ./nullptr 1 1 || RET=$$? ; echo "code="$$RET && if [ "$$RET" -ne "11" -a "$$RET" -ne "139" ] ; then false ; fi
+	RET=0 ; ./nullptr 0 0 || RET=$$? ; echo "code="$$RET && if [ "$$RET" -ne "139" ] ; then false ; fi
+	RET=0 ; ./nullptr 0 1 || RET=$$? ; echo "code="$$RET && if [ "$$RET" -ne "11" -a "$$RET" -ne "139" ] ; then false ; fi
+	RET=0 ; ./nullptr 2 0 || RET=$$? ; echo "code="$$RET && if [ "$$RET" -ne "136" ] ; then false ; fi
+	RET=0 ; ./nullptr 2 1 || RET=$$? ; echo "code="$$RET && if [ "$$RET" -ne "11" -a "$$RET" -ne "136" ] ; then false ; fi
+	./nullptr
+	echo "test" | ./ctest
+	./stackp
+	./entropy 10000
+	./locale
+	./vdso_test -c 10000
+	./test-signal-restore
+	./pthread
+	./pthread 100
+	./pthread 100 1
+	set -x
+
+clean:
+	rm -f ${TARGETS}
diff --git a/.github/workflows/bench.sh b/.github/workflows/bench.sh
new file mode 100755
index 00000000000000..7ecfc88e471b2f
--- /dev/null
+++ b/.github/workflows/bench.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+mount proc /proc -t proc
+export PATH=/home:/sbin:/usr/sbin:/bin:/usr/bin
+
+
+cd /lmbench2/bin/x86_64-linux-gnulibc1
+sh lmbench_run.sh
+
+/root/do_getpid -c 10000
+
+# test umh (usermode helper) to exec `modprobe`
+ifconfig eth10
+
+make -C /root tests
+RET=$?
+echo $RET
+echo $RET > /proc/exitcode
+halt -f
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000000000..c8bd507a46ea4f
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,401 @@
+on:
+  push:
+    branches:
+      - '**'
+  #pull_request:
+  #  branches:
+  #    - master
+  workflow_dispatch:
+    inputs:
+      debug_enabled:
+        description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)'
+        required: false
+        default: true
+
+jobs:
+  tests:
+    runs-on: ${{ matrix.runs_on }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - displayTargetName: ubuntu-22.04
+            os: unix
+            runs_on: ubuntu-22.04
+            shell: bash
+            defconfig: defconfig
+            add_configs: CONFIG_UML_NET_VECTOR=y CFLAGS+="-DCONFIG_UML_NET_VECTOR"
+            kunit_opts: --kconfig_add CONFIG_KUNIT_UML_PCI=n ### XXX
+            testname: um-mmu
+          - displayTargetName: ubuntu-22.04 (time-travel)
+            os: unix
+            runs_on: ubuntu-22.04
+            shell: bash
+            defconfig: defconfig
+            add_configs: CONFIG_UML_NET_VECTOR=y CFLAGS+="-DCONFIG_UML_NET_VECTOR" CFLAGS+="-DCONFIG_UML_TIME_TRAVEL_SUPPORT" CFLAGS+="-DCONFIG_UML_MAX_USERSPACE_ITERATIONS=100000"
+            kunit_opts: --kconfig_add CONFIG_KUNIT_UML_PCI=n ### XXX
+
testname: um-mmu-tt + - displayTargetName: ubuntu-22.04 (nommu) + os: unix + runs_on: ubuntu-22.04 + shell: bash + defconfig: x86_64_nommu_defconfig + add_configs: CONFIG_UML_NET_VECTOR=y CFLAGS+="-DCONFIG_UML_NET_VECTOR" + kunit_opts: --kconfig_add CONFIG_MMU=n --kconfig_add CONFIG_KUNIT_UML_PCI=n + testname: um-nommu + name: UML-${{ matrix.displayTargetName }} + timeout-minutes: 100 + env: + CCACHE_DIR: ${{ github.workspace }}/.ccache + USE_CCACHE: 1 + + defaults: + run: + shell: ${{ matrix.shell }} + + steps: + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }} + with: + detached: true + - name: Set env + shell: bash + run: | + echo "/usr/lib/ccache/bin:/usr/lib/ccache:${{ github.workspace }}/bin" >> $GITHUB_PATH + echo "export PATH=/usr/lib/ccache/bin:/usr/lib/ccache:${{ github.workspace }}/bin:$PATH" >> $HOME/.bashrc + - name: Checkout + uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: ${{ env.CCACHE_DIR }} + key: ${{ runner.os }}-${{ matrix.testname }}-ccache-build-${{ github.sha }} + restore-keys: ${{ runner.os }}-${{ matrix.testname }}-ccache-build- + - name: Install packages + run: | + sudo apt update -y + sudo apt install -y ccache iperf3 netperf gdb + - name: Setup latest Alpine Linux + uses: jirutka/setup-alpine@v1 + with: + branch: v3.20 + packages: > + alpine-sdk + doas + libtirpc-dev + linux-headers + autoconf + automake + - name: build bench tools + run: | + git clone https://github.com/ricarkol/lmbench2.git + git clone https://github.com/HewlettPackard/netperf + cd lmbench2 + touch src/Makefile + touch Makefile + make + cd .. + cd netperf + sh autogen.sh + ./configure; make CFLAGS="-fcommon" || true + cd .. 
+ make -C .github/workflows/ MASTER_CC=gcc + shell: alpine.sh {0} + # --root {0} + - name: Setup faketty + uses: Yuri6037/Action-FakeTTY@v1.1 + - name: setup tap interface + run: | + set -x + sudo ip tuntap add dev tap100 mode tap user ${USER} + sudo ip add add 192.168.122.1/24 dev tap100 + sudo ip link set up dev tap100 + cp .github/workflows/gdbinit ~/.gdbinit + iperf3 -s & + - name: build-0 (static) + run: | + make ARCH=um ${{ matrix.defconfig }} O=build + cat build/.config | sed "s/.*CONFIG_STATIC_LINK.*/CONFIG_STATIC_LINK=y/" > /tmp/a; mv /tmp/a build/.config + make -j8 ARCH=um O=build ${{ matrix.add_configs }} + - name: prep for docker build + if: matrix.testname != 'um-mmu-tt' + run: | + cp build/vmlinux .github/workflows/ + cp -rpf lmbench2 .github/workflows/ + cp netperf/src/netperf .github/workflows/ + - name: Build and push Docker image (no push) + if: matrix.testname != 'um-mmu-tt' + uses: docker/build-push-action@v6 + env: + DOCKER_BUILD_SUMMARY: false + with: + context: .github/workflows + build-args: "MODE=${{ matrix.testname }}" + push: false + tags: | + ghcr.io/thehajime/alpine:3.20.3-${{ matrix.testname }} + - name: image for test + if: matrix.testname != 'um-mmu-tt' + run: | + docker create --name alpine-nommu ghcr.io/thehajime/alpine:3.20.3-${{ matrix.testname }} + docker start alpine-nommu + docker wait alpine-nommu + docker logs alpine-nommu + docker export alpine-nommu > alpine.tar + docker rm alpine-nommu + mnt=$(mktemp -d) + dd if=/dev/zero of=alpine.ext4 bs=1 count=0 seek=1G + sudo chmod og+wr "alpine.ext4" + yes 2>/dev/null | mkfs.ext4 "alpine.ext4" || true + sudo mount "alpine.ext4" $mnt + sudo tar -xf alpine.tar -C $mnt + sudo umount $mnt + mkdir -p rootfs + sudo tar -xf alpine.tar -C rootfs + sudo chown -R ${USER} rootfs + - name: test-0 (static) + if: matrix.testname != 'um-mmu-tt' + run: | + sudo sh -c "echo 0 > /proc/sys/vm/mmap_min_addr" + faketty ./build/vmlinux vec0:transport=tap,ifname=tap100,depth=128,gro=1 ubd0=./alpine.ext4 
rw mem=1024m loglevel=8 console=tty init=/sbin/init 2>&1 | tee /tmp/log.txt & + sleep 10 && pkill vmlinux + echo "=========" + cat /tmp/log.txt + - name: kunit test + if: matrix.testname != 'um-mmu-tt' + run: | + for config in `find ./ -name .kunitconfig | grep -v -E "kfence|sunrpc|handshake|kcsan|gpu|damon|rust|android|sound|hid|firewire|iommu" `; do \ + echo "==" $config "==" ; \ + ./tools/testing/kunit/kunit.py run --kunitconfig=$config ${{ matrix.kunit_opts }} + done + - name: build-1 + run: | + make ARCH=um ${{ matrix.defconfig }} + make -j8 ARCH=um ${{ matrix.add_configs }} + - name: test-1 + if: matrix.testname != 'um-mmu-tt' + run: | + faketty ./vmlinux vec0:transport=tap,ifname=tap100,depth=128,gro=1 ubd0=./alpine.ext4 rw mem=1024m loglevel=8 init=/sbin/init 2>&1 | tee /tmp/log.txt & + sleep 10 && pkill vmlinux + echo "=========" + cat /tmp/log.txt + - name: benchmark-0 + if: matrix.testname == 'um-nommu' + run: | + mkdir -p output + ## disable zpoline test for a while + faketty ./vmlinux ubd0=./alpine.ext4 rw mem=1024m loglevel=8 console=tty zpoline=1 init=/bench.sh \ + | tee output/${{ matrix.testname }}-zpoline.dat + faketty ./vmlinux ubd0=./alpine.ext4 rw mem=1024m loglevel=8 console=tty zpoline=0 init=/bench.sh \ + | tee output/${{ matrix.testname }}-seccomp.dat + - name: benchmark-1 + if: matrix.testname == 'um-mmu' + run: | + mkdir -p output + faketty ./vmlinux ubd0=./alpine.ext4 rw mem=1024m loglevel=0 console=tty init=/bench.sh \ + | tee output/${{ matrix.testname }}.dat + faketty ./vmlinux ubd0=./alpine.ext4 rw mem=1024m loglevel=0 console=tty seccomp=on init=/bench.sh \ + | tee output/${{ matrix.testname }}-seccomp.dat + - name: benchmark-2-iperf3 + if: matrix.testname != 'um-mmu-tt' + run: | + mkdir -p output + if [ "${{ matrix.testname }}" == "um-nommu" ] ; then + ## dynamic build failed at iperf -R + echo "===========================non-static build may fail...===========================" + #faketty ./vmlinux 
vec0:transport=tap,ifname=tap100,depth=128,gro=1 mem=1024m \ + # root=/dev/root rootflags=`pwd`/rootfs rootfstype=hostfs rw \ + # loglevel=0 console=tty init=/root/iperf3.sh \ + # | tee output/${{ matrix.testname }}-seccomp-iperf3.dat + # faketty ./vmlinux vec0:transport=tap,ifname=tap100,depth=128,gro=1 mem=1024m \ + # root=/dev/root rootflags=`pwd`/rootfs rootfstype=hostfs rw \ + # loglevel=0 console=tty zpoline=1 init=/root/iperf3.sh \ + # | tee output/${{ matrix.testname }}-zpoline-iperf3.dat + faketty ./build/vmlinux vec0:transport=tap,ifname=tap100,depth=128,gro=1 mem=1024m \ + root=/dev/root rootflags=`pwd`/rootfs rootfstype=hostfs rw \ + loglevel=0 console=tty init=/root/iperf3.sh \ + | tee output/${{ matrix.testname }}-seccomp-iperf3.dat + faketty ./build/vmlinux vec0:transport=tap,ifname=tap100,depth=128,gro=1 mem=1024m \ + root=/dev/root rootflags=`pwd`/rootfs rootfstype=hostfs rw \ + loglevel=0 console=tty zpoline=1 init=/root/iperf3.sh \ + | tee output/${{ matrix.testname }}-zpoline-iperf3.dat + else + faketty ./build/vmlinux vec0:transport=tap,ifname=tap100,depth=128,gro=1 mem=1024m \ + root=/dev/root rootflags=`pwd`/rootfs rootfstype=hostfs rw \ + loglevel=0 console=tty init=/root/iperf3.sh \ + | tee output/${{ matrix.testname }}-iperf3.dat + faketty ./build/vmlinux vec0:transport=tap,ifname=tap100,depth=128,gro=1 mem=1024m \ + root=/dev/root rootflags=`pwd`/rootfs rootfstype=hostfs rw \ + loglevel=0 console=tty seccomp=on init=/root/iperf3.sh \ + | tee output/${{ matrix.testname }}-seccomp-iperf3.dat + fi + - name: Set up Docker Buildx + if: matrix.testname != 'um-mmu-tt' + uses: docker/setup-buildx-action@v3 + with: + driver-opts: network=host + - name: Log in to the ghcr.io + if: matrix.testname != 'um-mmu-tt' + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build and push Docker image + if: matrix.testname != 'um-mmu-tt' + uses: docker/build-push-action@v6 
+ env: + DOCKER_BUILD_SUMMARY: false + with: + context: .github/workflows + build-args: "MODE=${{ matrix.testname }}" + push: true + tags: | + ghcr.io/thehajime/alpine:3.20.3-${{ matrix.testname }} + - uses: actions/upload-artifact@v4 + if: matrix.testname != 'um-mmu-tt' + with: + name: bench-result-${{ matrix.testname }} + path: output/${{ matrix.testname }}* + + checkpatch: + runs-on: ubuntu-22.04 + name: checkpatch and co + env: + CCACHE_DIR: ${{ github.workspace }}/.ccache + USE_CCACHE: 1 + steps: + - name: Set env + shell: bash + run: | + echo "/usr/lib/ccache/bin:/usr/lib/ccache:${{ github.workspace }}/bin" >> $GITHUB_PATH + echo "export PATH=/usr/lib/ccache/bin:/usr/lib/ccache:${{ github.workspace }}/bin:$PATH" >> $HOME/.bashrc + - name: Checkout + with: + fetch-depth: 0 + uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: ${{ env.CCACHE_DIR }} + key: ${{ runner.os }}-checkpatch-ccache-build-${{ github.sha }} + restore-keys: ${{ runner.os }}-checkpatch-ccache-build- + - name: Install packages + run: | + sudo pip install ply GitPython mypy + sudo apt update -y + sudo apt install -y aspell ccache sparse + git clone https://github.com/daxtens/smart-sparse-diff + #- name: Setup tmate session + # uses: mxschmitt/action-tmate@v3 + # with: + # detached: true + - name: checkout trees + run: | + #git remote add linus git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git + #git fetch linus + #git format-patch -o p1 linus/master..HEAD~1 + git remote add uml git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git + git fetch uml + git checkout uml/next + git checkout zpoline-nommu-v6.10 + #- name: build (sparse) + # run: | + # make ARCH=um W=1 C=2 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__ -fmax-errors=unlimited -fmax-warnings=unlimited' defconfig O=build-sparse + # make ARCH=um W=1 C=2 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__ -fmax-errors=unlimited -fmax-warnings=unlimited' O=build-sparse clean + # make ARCH=um W=1 C=2 
CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__ -fmax-errors=unlimited -fmax-warnings=unlimited' O=build-sparse -j8 2> old-sparse.log 1> /dev/null
+      #    git checkout zpoline-nommu-v6.10
+      #    git checkout HEAD~1
+      #    make ARCH=um W=1 C=2 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__ -fmax-errors=unlimited -fmax-warnings=unlimited' O=build-sparse -j8 2> new-sparse.log 1> /dev/null
+      #    ./smart-sparse-diff/smart-sparse-diff.py old-sparse.log new-sparse.log
+      - name: check coding style
+        continue-on-error: true
+        run: |
+          git format-patch -o p1 uml/next..HEAD~19
+          ./scripts/checkpatch.pl --summary-file --ignore FILE_PATH_CHANGES \
+            --ignore AVOID_EXTERNS p1/*.patch
+      - name: check spells on commit message
+        run: |
+          git log uml/next..HEAD~19 |grep -v -E "^commit" > /tmp/a
+          aspell list < /tmp/a  # "aspell list" checks the text it receives on stdin
+
+  bench-result:
+    runs-on: ubuntu-22.04
+    name: bench-result
+    needs: tests
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/download-artifact@v4
+        with:
+          path: output
+          pattern: bench-result-*
+          merge-multiple: true
+      - name: Display structure of downloaded files
+        run: ls -R output
+      - name: Install packages
+        run: |
+          sudo apt update -y
+          sudo apt install -y gnuplot lmbench iperf3 netperf
+      - name: benchmark-host
+        run: |
+          mkdir -p output
+          sudo ln -s /usr/include/x86_64-linux-gnu/sys /usr/include/sys
+          sudo mkdir -p /var/tmp/lmbench/
+          sudo chown ${USER} /var/tmp/lmbench/
+          sh .github/workflows/lmbench_run.sh |& tee output/native.dat
+          gcc -o do_getpid .github/workflows/do_getpid.c
+          ./do_getpid -c 100 | tee -a output/native.dat
+          iperf3 -s &
+          iperf3 -c localhost -fm | tee output/native-iperf3.dat
+          iperf3 -c localhost -fm -R | tee -a output/native-iperf3.dat
+          bash .github/workflows/netperf-bench.sh localhost netperf | tee -a output/native-iperf3.dat
+      - name: bench data parse/out
+        run: |
+          bash .github/workflows/um-nommu-plot.sh output |& tee um-nommu.log
+          bash .github/workflows/netperf-plot.sh output |& tee -a um-nommu.log
+          export TMP_OUTPUT=$(cat um-nommu.log)
+          echo "TMP_OUTPUT<<EOF" >> $GITHUB_ENV  # open a multi-line env value, closed by the EOF line below
+          echo "$TMP_OUTPUT" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+          echo ${{env.TMP_OUTPUT}}
+      - uses: actions/upload-artifact@v4
+        with:
+          name: bench-result-all
+          path: output/out/*
+      - name: publish to imgur
+        uses: devicons/public-upload-to-imgur@v2.2.2
+        id: lmbench-imgur
+        with:
+          path: "output/out/lmbench.png"
+          client_id: ${{secrets.IMGUR_CLIENT_ID}}
+      - name: publish to imgur-iperf
+        uses: devicons/public-upload-to-imgur@v2.2.2
+        id: iperf3-imgur
+        with:
+          path: "output/out/iperf3.png"
+          client_id: ${{secrets.IMGUR_CLIENT_ID}}
+      - name: publish to imgur-netperf
+        uses: devicons/public-upload-to-imgur@v2.2.2
+        id: netperf-imgur
+        with:
+          path: "output/out/tcp-stream.png"
+          client_id: ${{secrets.IMGUR_CLIENT_ID}}
+      - name: prepare action post
+        run: |
+          export SCRIPT_OUTPUT="${{ join(fromJSON(steps.lmbench-imgur.outputs.markdown_urls)) }} ${{ join(fromJSON(steps.iperf3-imgur.outputs.markdown_urls)) }} ${{ join(fromJSON(steps.netperf-imgur.outputs.markdown_urls)) }}"
+          echo "SCRIPT_OUTPUT<<EOF" >> $GITHUB_ENV  # multi-line value: benchmark log, blank line, image links
+          echo "$TMP_OUTPUT" >> $GITHUB_ENV
+          echo "" >> $GITHUB_ENV
+          echo "$SCRIPT_OUTPUT" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+          echo ${{env.SCRIPT_OUTPUT}}
+      - uses: actions/github-script@v7
+        env:
+          COMMENT_BODY: ${{env.SCRIPT_OUTPUT}}
+        with:
+          script: |
+            github.rest.issues.createComment({
+              issue_number: 1,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: process.env.COMMENT_BODY
+            })
diff --git a/.github/workflows/clone.c b/.github/workflows/clone.c
new file mode 100644
index 00000000000000..811b61f6b105f6
--- /dev/null
+++ b/.github/workflows/clone.c
@@ -0,0 +1,85 @@
+#define _GNU_SOURCE
+#include <sys/wait.h>
+#include <sys/utsname.h>
+#include <sched.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \
+} while (0)
+
+	static int /* Start function for cloned child */
+childFunc(void *arg)
+{
+	struct utsname uts;
+
+	printf("arg=%lx\n", (unsigned long)arg);
+
+	/* Change hostname in UTS namespace
of child */
+
+	if (sethostname(arg, strlen(arg)) == -1)
+		errExit("sethostname");
+
+	/* Retrieve and display hostname */
+
+	if (uname(&uts) == -1)
+		errExit("uname");
+	printf("uts.nodename in child: %s\n", uts.nodename);
+
+	/* The child returns immediately (nothing sleeps here), so the
+	   new UTS namespace is not kept open for another process to
+	   join. */
+
+	return 0; /* Child terminates now */
+}
+
+#define STACK_SIZE (1024 * 1024) /* Stack size for cloned child */
+
+	int
+main(int argc, char *argv[])
+{
+	char *stack; /* Start of stack buffer */
+	char *stackTop; /* End of stack buffer */
+	pid_t pid;
+	struct utsname uts;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s <child-hostname>\n", argv[0]);
+		exit(EXIT_SUCCESS);
+	}
+
+	/* Allocate stack for child */
+
+	stack = malloc(STACK_SIZE);
+	if (stack == NULL)
+		errExit("malloc");
+	stackTop = stack + STACK_SIZE; /* Assume stack grows downward */
+
+	/* Create child that has its own UTS namespace;
+	   child commences execution in childFunc() */
+
+	printf("arg=%lx\n", (unsigned long)argv[1]);
+	pid = clone(childFunc, stackTop, CLONE_NEWUTS | SIGCHLD, argv[1]);
+	if (pid == -1)
+		errExit("clone");
+	sleep(1);
+	printf("clone() returned %ld\n", (long) pid);
+
+	/* Parent falls through to here */
+
+	/* Display hostname in parent's UTS namespace. This will be
+	   different from hostname in child's UTS namespace.
*/ + + if (uname(&uts) == -1) + errExit("uname"); + printf("uts.nodename in parent: %s\n", uts.nodename); + + if (waitpid(pid, NULL, 0) == -1) /* Wait for child */ + errExit("waitpid"); + printf("child has terminated\n"); + + exit(EXIT_SUCCESS); +} + diff --git a/.github/workflows/ctest.c b/.github/workflows/ctest.c new file mode 100644 index 00000000000000..791c27a6869938 --- /dev/null +++ b/.github/workflows/ctest.c @@ -0,0 +1,259 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + + +static void __kill(int pid, int sig) { + register int syscall_no asm("rax") = 62; + register int arg1 asm("rdi") = pid; + register int arg2 asm("rsi") = sig; + asm("syscall"); +} + +static void exit_host(int r){ + __kill(0, 9); + return; +} + +int vfork1() +{ + int a = 0; + int ret = 0xfafafafa; + ret = vfork(); + return ret; +} + +int vfork2() +{ + return vfork(); +} + +void test_files(); + +char *argv[3] = {"/ctest", "noop", NULL}; +char *env[2] = {"PATH=/", NULL}; + +int test_vfork() +{ + int i, ret; + int status; + + ret = vfork(); + if (ret == 0) { + printf("child: ret=%d\r\n", ret); + //printf("got %c\n", getchar()); + ret = execve("./ctest", argv, env); + if (ret != 0) { + printf("child: exec ret=%d\r\n", ret); + while(1); + _exit(0); + } + + // should not be here + test_files(); + _exit(0); + } else if (ret < 0) { + printf("error %d\r\n", ret); + while(1); + _exit(0); + } else { + wait(&status); + //while(1); + printf("parent (%s): fork status=%d child_pid=%d\r\n", __FUNCTION__, status, ret); + } + return ret; +} + +void test_files() +{ + printf("opening\r\n"); + char *tmpname = "/files/mifile"; + char buf[128]; + int fd = open("/files/mifile2", O_RDONLY, 0); + if (fd < 1) { + printf("could not open %d\n", fd); + return; + } + //write(fd, "_*_", 3); + fsync(fd); + close(fd); + printf("opening\r\n"); + fd = open("/files/mifile2", O_RDONLY, 0); + if (fd < 1) { + printf("could not open %d\n", fd); + return; + } + + 
memset(buf, 0, 128); + read(fd, buf, 64); + printf("%s\r\n", buf); +} + +int test_vfork_only() +{ + int i, ret; + int status; + + ret = vfork1(); + if (ret == 0) { + printf("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n"); + printf("child %s\n", __FUNCTION__); + printf("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n"); + sleep(1); + _exit(0); + } else if (ret < 0) { + printf("error %d\r\n", ret); + while(1); + _exit(0); + } else { + sleep(1); + printf("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n"); + printf("parent (%s): fork status=%d child_pid=%d\r\n", __FUNCTION__, status, ret); + printf("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n"); + } + return ret; +} + + + +int test_vfork_sleeping() +{ + int i, ret; + int status; + + ret = vfork(); + if (ret == 0) { + ret = execve("./ctest", argv, env); + _exit(0); + } else if (ret < 0) { + printf("error %d\r\n", ret); + while(1); + _exit(0); + } else { + printf("parent (%s): fork status=%d child_pid=%d\r\n", __FUNCTION__, status, ret); + } + return ret; +} + + +int test_vfork_in_vfork() +{ + int i, ret; + int status; + + ret = vfork(); + if (ret == 0) { + printf("child 1\n"); + ret = vfork(); + if (ret == 0) { + printf("child 2\n"); + _exit(0); + } else if (ret < 0) { + printf("error %d\r\n", ret); + while(1); + _exit(0); + } else { + printf("parent: fork status=%d child_pid=%d\r\n", status, ret); + } + _exit(0); + } else if (ret < 0) { + printf("error %d\r\n", ret); + while(1); + _exit(0); + } else { + printf("parent (%s): fork status=%d child_pid=%d\r\n", __FUNCTION__, status, ret); + } + return ret; +} + +int test_exec_with_vfork_in_vfork() +{ + int i, ret; + int status; + + char *argv[3] = {"/ctest", "fork", NULL}; + char *env[2] = {"PATH=/", NULL}; + + ret = vfork(); + if (ret == 0) { + printf("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n"); + printf("child %s\n", __FUNCTION__); + printf("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n"); 
+		ret = execve("./ctest", argv, env);
+		while(1); // should not be reached
+	} else if (ret < 0) {
+		printf("error %d\r\n", ret);
+		while(1);
+		_exit(0);
+	} else {
+		printf("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n");
+		printf("parent (%s): fork status=%d child_pid=%d\r\n", __FUNCTION__, status, ret);
+		printf("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n");
+	}
+	return ret;
+}
+
+/* Dispatch on argv[1] (files|loop|fork|exec|noop|exit|time); with no or an unknown argument run the whole vfork/exec/file sequence. */
+int main(int argc, char **argv)
+{
+	int i, ret;
+	int iterations;
+
+	if (argc < 2) {
+		iterations = 10;
+	} else if (strcmp(argv[1], "files") == 0) {
+		test_files();
+		exit(0);
+	} else if (strcmp(argv[1], "loop") == 0) {
+		printf("looping forever\n");
+		//printf("got %c\n", getchar());
+		while(1);
+		exit(0);
+	} else if (strcmp(argv[1], "fork") == 0) {
+		//test_vfork_only();
+		test_vfork();
+		exit(0);
+	} else if (strcmp(argv[1], "exec") == 0) {
+		test_exec_with_vfork_in_vfork();
+		test_files();
+		exit(0);
+	} else if (strcmp(argv[1], "noop") == 0) {
+		printf("noop\n");
+		exit(0);
+	} else if (strcmp(argv[1], "exit") == 0) {
+		printf("exit");
+		exit_host(0);
+	} else if (strcmp(argv[1], "time") == 0) {
+		struct timeval current_time;
+		gettimeofday(&current_time, NULL);
+		printf("seconds : %ld\nmicro seconds : %ld\n",
+				current_time.tv_sec, current_time.tv_usec);
+		exit(0);
+	} else {
+		printf("default arg\n");
+	}
+
+	test_exec_with_vfork_in_vfork();
+
+	test_vfork();
+	test_vfork();
+	test_vfork();
+	test_vfork();
+	test_vfork_sleeping();
+
+	test_files();
+
+	printf("got %c\n", getchar());
+
+	//while(1);
+	exit(0);
+}
diff --git a/.github/workflows/default.script b/.github/workflows/default.script
new file mode 100644
index 00000000000000..9395587148be2b
--- /dev/null
+++ b/.github/workflows/default.script
@@ -0,0 +1,190 @@
+#!/bin/sh
+
+# script for udhcpc
+# Copyright (c) 2008 Natanael Copa
+
+UDHCPC="/etc/udhcpc"
+UDHCPC_CONF="$UDHCPC/udhcpc.conf"
+
+RESOLV_CONF="/etc/resolv.conf"
+[ -f "$UDHCPC_CONF" ] && .
"$UDHCPC_CONF" + +export RESOLV_CONF + +export broadcast +export dns +export domain +export interface +export ip +export mask +export metric +export staticroutes +export router +export subnet + +export PATH=/usr/bin:/bin:/usr/sbin:/sbin + +run_scripts() { + local dir="$1" + local script + + if [ -d "$dir" ]; then + for script in "$dir"/*; do + if [ -f "$script" ] && [ -x "$script" ]; then + "$script" + fi + done + fi +} + +deconfig() { + ip -4 addr flush dev $interface +} + +is_wifi() { + test -e /sys/class/net/$interface/phy80211 +} + +if_index() { + if [ -e /sys/class/net/$interface/ifindex ]; then + cat /sys/class/net/$interface/ifindex + else + ip -4 link show dev $interface | head -n1 | cut -d: -f1 + fi +} + +calc_metric() { + local base= + if is_wifi; then + base=300 + else + base=200 + fi + echo $(( base + $(if_index) )) +} + +route_add() { + local to=$1 gw=$2 num=$3 + # special case for /32 subnets: + # /32 instructs kernel to always use routing for all outgoing packets + # (they can never be sent to local subnet - there is no local subnet for /32). + # Used in datacenters, avoids the need for private ip-addresses between two hops. + if [ "$subnet" = "255.255.255.255" ]; then + ip -4 route add $gw dev $interface + fi + ip -4 route add $to via $gw dev $interface \ + metric $(( $num + ${IF_METRIC:-$(calc_metric)} )) +} + +routes() { + [ -z "$router" ] && [ -z "$staticroutes" ] && return + local iface= + for iface in $NO_GATEWAY; do + [ "$iface" = "$interface" ] && return + done + while ip -4 route del default via dev $interface 2>/dev/null; do + : + done + local num=0 + # RFC3442: + # If the DHCP server returns both a Classless Static Routes option + # and a Router option, the DHCP client MUST ignore the Router option. + if [ -n "$staticroutes" ]; then + # static routes format: dest1/mask gw1 ... 
destn/mask gwn
+		set -- $staticroutes
+		while [ -n "$1" ] && [ -n "$2" ]; do
+			local dest="$1" gw="$2"
+			if [ "$gw" != "0.0.0.0" ]; then
+				route_add $dest $gw $num && num=$(( num + 1))
+			fi
+			shift 2
+		done
+	else
+		local gw=
+		for gw in $router; do
+			route_add 0.0.0.0/0 $gw $num && num=$(( num + 1 ))
+		done
+	fi
+}
+
+resolv_conf() {
+	local i
+	[ -n "$IF_PEER_DNS" ] && [ "$IF_PEER_DNS" != "yes" ] && return
+	if [ "$RESOLV_CONF" = "no" ] || [ "$RESOLV_CONF" = "NO" ] \
+			|| [ -z "$RESOLV_CONF" ] || [ -z "$dns" ]; then
+		return
+	fi
+	for i in $NO_DNS; do
+		[ "$i" = "$interface" ] && return
+	done
+	echo -n > "$RESOLV_CONF.$$"
+	if [ -n "$search" ]; then
+		echo "search $search" >> "$RESOLV_CONF.$$"
+	elif [ -n "$domain" ]; then
+		echo "search $domain" >> "$RESOLV_CONF.$$"
+	fi
+	for i in $dns; do
+		echo "nameserver $i" >> "$RESOLV_CONF.$$"
+	done
+	chmod a+r "$RESOLV_CONF.$$"
+	if grep -q "^# Generated by resolvconf" "$RESOLV_CONF" 2>/dev/null \
+			&& [ -x "$(command -v resolvconf)" ]; then
+		resolvconf -a "udhcpc" < "$RESOLV_CONF.$$"
+		rm "$RESOLV_CONF.$$"
+	else
+		mv -f "$RESOLV_CONF.$$" "$RESOLV_CONF"
+	fi
+}
+
+bound() {
+	ip -4 addr add $ip/$mask ${broadcast:+broadcast $broadcast} dev $interface
+	ip -4 link set dev $interface up
+	routes
+	resolv_conf
+}
+# renew: re-apply the address, routes and resolv.conf only where they differ from the lease
+renew() {
+	if ! ip -4 addr show dev $interface | grep $ip/$mask; then
+		ip -4 addr flush dev $interface
+		ip -4 addr add $ip/$mask ${broadcast:+broadcast $broadcast} dev $interface
+	fi
+
+	local i
+	for i in $router; do
+		if ! ip -4 route show | grep ^default | grep $i; then
+			routes
+			break
+		fi
+	done
+	# grep needs $RESOLV_CONF as its file operand; without one it would read stdin
+	if ! grep -q "^search $domain" "$RESOLV_CONF" 2>/dev/null; then
+		resolv_conf
+		return
+	fi
+	for i in $dns; do
+		if ! grep -q "^nameserver $i" "$RESOLV_CONF" 2>/dev/null; then
+			resolv_conf
+			return
+		fi
+	done
+}
+
+case "$1" in
+	deconfig|renew|bound)
+		run_scripts "$UDHCPC/pre-$1"
+		$1
+		run_scripts "$UDHCPC/post-$1"
+		;;
+	leasefail)
+		echo "udhcpc failed to get a DHCP lease" >&2
+		;;
+	nak)
+		echo "udhcpc received DHCP NAK" >&2
+		;;
+	*)
+		echo "Error: this script should be called from udhcpc" >&2
+		exit 1
+		;;
+esac
+
+exit 0
diff --git a/.github/workflows/do_getpid.c b/.github/workflows/do_getpid.c
new file mode 100644
index 00000000000000..9a1171ded536ed
--- /dev/null
+++ b/.github/workflows/do_getpid.c
@@ -0,0 +1,73 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+#include <sys/types.h>
+
+/* Micro-benchmark: time "-c N" calls of a raw syscall 39 stub (or libc getpid() with BENCH_LDPRELOAD) and print the average. */
+#ifndef BENCH_LDPRELOAD
+extern pid_t do_getpid(void);
+
+void __do_getpid(void)
+{
+	asm volatile (".globl do_getpid");
+	asm volatile ("do_getpid:");
+	asm volatile ("movq $39, %rax");
+	asm volatile ("syscall");
+	asm volatile ("ret");
+}
+#else
+pid_t do_getpid(void)
+{
+	return getpid();
+}
+#endif
+
+int main(int argc, char* const* argv)
+{
+	int ch;
+	unsigned long loopcnt = 0;
+
+	while ((ch = getopt(argc, argv, "c:")) != -1) {
+		switch (ch) {
+		case 'c':
+			loopcnt = atol(optarg);
+			break;
+
+		default:
+			printf("unknown option\n");
+			exit(1);
+		}
+	}
+
+	if (!loopcnt) {
+		printf("please specify loop count by -c\n");
+		exit(0);
+	}
+
+	{
+		pid_t my_pid = getpid();
+		{
+			unsigned long t;
+			{
+				struct timespec ts;
+				clock_gettime(CLOCK_REALTIME, &ts);
+				t = ts.tv_sec * 1000000000UL + ts.tv_nsec;
+			}
+			{
+				unsigned long i;
+				for (i = 0; i < loopcnt; i++)
+					assert(my_pid == do_getpid());
+			}
+			{
+				struct timespec ts;
+				clock_gettime(CLOCK_REALTIME, &ts);
+				t = ts.tv_sec * 1000000000UL + ts.tv_nsec - t;
+			}
+			printf("average %5lu nsec\n", t / loopcnt);
+		}
+	}
+
+	return 0;
+}
diff --git a/.github/workflows/entropy.c b/.github/workflows/entropy.c
new file mode 100644
index 00000000000000..59ae1da93239c9
--- /dev/null
+++ b/.github/workflows/entropy.c
@@ -0,0 +1,52 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+
+#define BUFSIZE 256
+
+struct rand_pool_info {
+	int entropy_count;
+	int buf_size;
+	unsigned int buf[256];
+};
+
+uint8_t prng(void) {
+	static uint8_t seed=19;
+	seed = 311 * seed + 17;
+	return seed;
+}
+
+#define BUF_SIZE 256
+#define MAX_ITERS 10000
+
+#define RNDADDENTROPY 0x40085203
+/* Push up to max_iters (argv[1], default MAX_ITERS) blocks of prng() output into /dev/random with RNDADDENTROPY; stop at the first failing ioctl. */
+int main(int argc, char **argv)
+{
+	struct rand_pool_info *output;
+	int max_iters = MAX_ITERS;
+	int iters = 0, ret;
+	int fd = open("/dev/random", O_WRONLY);
+
+	if (argc == 2)
+		max_iters = atoi(argv[1]);
+
+	printf("generating \"entropy\"...");
+	output = (struct rand_pool_info *)malloc(sizeof(struct rand_pool_info) +
+						 BUF_SIZE);
+	do {
+		int i;
+		output->entropy_count = BUF_SIZE * 8;
+		output->buf_size = BUF_SIZE;
+		for (i=0; i< BUF_SIZE; i++)
+			output->buf[i] = prng();
+		ret = ioctl(fd, RNDADDENTROPY, output); /* hand over the buffer itself, not the address of the pointer */
+		iters++;
+	} while((ret >= 0) && (iters < max_iters));
+
+	printf("for %d iters\n", iters);
+}
+
diff --git a/.github/workflows/exit.c b/.github/workflows/exit.c
new file mode 100644
index 00000000000000..b60675e1e7d381
--- /dev/null
+++ b/.github/workflows/exit.c
@@ -0,0 +1,13 @@
+
+static void kill(int pid, int sig) {
+	register int syscall_no asm("rax") = 62;
+	register int arg1 asm("rdi") = pid;
+	register int arg2 asm("rsi") = sig;
+	asm("syscall");
+}
+
+void _start()
+{
+	kill(0, 9);
+	return;
+}
diff --git a/.github/workflows/futex.c b/.github/workflows/futex.c
new file mode 100644
index 00000000000000..566e61c8773bee
--- /dev/null
+++ b/.github/workflows/futex.c
@@ -0,0 +1,55 @@
+#include <pthread.h>
+#include <stdio.h>
+
+/* this function is run by the second thread */
+void *inc_x(void *x_void_ptr)
+{
+
+	/* increment x to 100 */
+	int *x_ptr = (int *)x_void_ptr;
+	while(++(*x_ptr) < 100);
+
+	printf("x increment finished\n");
+
+	/* the function must return something - NULL will do */
+	return NULL;
+
+}
+
+int main()
+{
+
+	int x = 0, y = 0;
+
+	/* show the initial values of x and y */
+	printf("x: %d, y: %d\n", x, y);
+
+	/* this variable is our
reference to the second thread */ + pthread_t inc_x_thread; + + /* create a second thread which executes inc_x(&x) */ + if(pthread_create(&inc_x_thread, NULL, inc_x, &x)) { + + fprintf(stderr, "Error creating thread\n"); + return 1; + + } + /* increment y to 100 in the first thread */ + while(++y < 100); + + printf("y increment finished\n"); + + /* wait for the second thread to finish */ + if(pthread_join(inc_x_thread, NULL)) { + + fprintf(stderr, "Error joining thread\n"); + return 2; + + } + + /* show the results - x is now 100 thanks to the second thread */ + printf("x: %d, y: %d\n", x, y); + + return 0; + +} diff --git a/.github/workflows/gdbinit b/.github/workflows/gdbinit new file mode 100644 index 00000000000000..24006df73d18da --- /dev/null +++ b/.github/workflows/gdbinit @@ -0,0 +1,221 @@ +#add-auto-load-safe-path /home/tazaki/gitworks/osv/scripts/loader.py + +handle SIGUSR1 nostop +#set annotate 1 +set history save on +set history size 10000 +set history filename ~/.gdb_history +set print pretty on +set print static-members off +set charset ASCII +set print thread-events off +directory /home/tazaki/work/nabla-linux/tests/busybox.git:/home/tazaki/work/nabla-linux/tests/musl-git/:/home/tazaki/work/deb-src/glibc-2.39/ + +define ntoa + set $ipv4 = $arg0 + echo IPV4 =. 
+ p $ipv4 + set $val1 = ($ipv4 >> 24) & 0xff + set $val2 = ($ipv4 >> 16) & 0xff + set $val3 = ($ipv4 >> 8) & 0xff + set $val4 = ($ipv4 >> 0) & 0xff + printf "IPV4=%u=0x%02x.%02x.%02x.%02x =%d.%d.%d.%d\n", $ipv4, $val1, $val2, $val3, $val4, $val1, $val2, $val3, $val4 + end + +# change prompt +#source /home/tazaki/gdb-prompt.py + +define walk_list + set $temp = $arg0 + while ($temp) + printf "(%d,0x%X)-->",((sll*)$temp)->data,((sll*)$temp)->next + set $temp = ((sll*)$temp)->next + end + printf "Done\n" +end + +define walk_tailq + set $temp = $arg0->tqh_first + while ($temp) + printf "(%s,0x%lx,runnable=%d)-->", ((struct thread *)$temp)->name,(struct thread *)$temp, is_runnable((struct thread *)$temp) + set $temp = ((struct thread *)$temp)->thread_list->tqe_next + end + printf "Done\n" +end + +define lkl_hijack + set environment LD_LIBRARY_PATH=../openssl:tools/lkl/lib/hijack/:lib/hijack/ + set exec-wrapper env 'LD_PRELOAD=liblkl-hijack.so' +end + +define zpoline + set exec-wrapper env 'LD_PRELOAD=./libzpoline.so' + set environment LIBZPHOOK=libzphook_basic.so + set environment LD_LIBRARY_PATH=/home/tazaki/work/zpoline/zpoline/apps/basic/ +end + +define zpoline_lkl + set environment LKL_HIJACK_ZPOLINE=1 + set environment LIBZPHOOK=liblkl-zpoline.so + set environment LD_LIBRARY_PATH=../openssl:tools/lkl/lib/hijack/:lib/hijack/:/home/tazaki/work/zpoline/zpoline + set exec-wrapper env 'LD_PRELOAD=libzpoline.so' +end + +define zpoline_rsocket + set environment LIBZPHOOK=librszpoline.so + set environment LD_LIBRARY_PATH=./build/lib/:/home/tazaki/work/zpoline/zpoline + set exec-wrapper env 'LD_PRELOAD=libzpoline.so' +end + +define rsocket + set environment LD_LIBRARY_PATH=./build/lib/ + set exec-wrapper env 'LD_PRELOAD=librspreload.so' +end + +define upregs + set $regs = 0 + if $argc == 0 + set $regs = (((struct pt_regs *)current_ptregs)->regs->gp) + else + set $regs = ((struct uml_pt_regs *)$arg0)->gp + end + + printf "rax=0x%08lx\t\tAX[10]=0x%08lx\t\tO_AX[15]=0x%lx\n", 
$rax, $regs[10], $regs[15] + printf "rbx=0x%08lx\t\tBX[5]=0x%08lx\n", $rbx, $regs[5] + printf "rcx=0x%08lx\t\tCX[11]=0x%08lx\n", $rcx,$regs[11] + printf "rdx=0x%08lx\t\tDX[12]=0x%08lx\n", $rdx,$regs[12] + printf "rsi=0x%08lx\t\tSI[13]=0x%08lx\n", $rsi, $regs[13] + printf "rdi=0x%08lx\t\tDI[14]=0x%08lx\n", $rdi, $regs[14] + printf "rbp=0x%08lx\t\tBP[4]=0x%08lx\n", $rbp, $regs[4] + printf "rsp=0x%08lx\t\tSP[19]=0x%08lx\n", $rsp, $regs[19] + printf " r8=0x%08lx\t\tR8[9]=0x%08lx\n", $r8,$regs[9] + printf " r9=0x%08lx\t\tR9[8]=0x%08lx\n", $r9,$regs[8] + printf "r10=0x%08lx\t\tR10[7]=0x%08lx\n", $r10,$regs[7] + printf "r11=0x%08lx\t\tR11[6]=0x%08lx\n", $r11,$regs[6] + printf "r12=0x%08lx\t\tR12[3]=0x%08lx\n", $r12,$regs[3] + printf "r13=0x%08lx\t\tR13[2]=0x%08lx\n", $r13,$regs[2] + printf "r14=0x%08lx\t\tR14[1]=0x%08lx\n", $r14,$regs[1] + printf "r15=0x%08lx\t\tR15[0]=0x%08lx\n", $r15,$regs[0] + printf "rip=0x%08lx\t\tIP[16]=0x%08lx\n", $rip,$regs[16] + printf "eflags=0x%08lx\t\tEFLAGS[18]=0x%lx\n", $eflags,$regs[18] + printf "CS[17]=0x%lx, ", $regs[17] + printf "SS[20]=0x%lx\n", $regs[20] +end + +define pregs + if $argc == 1 + print $arg0 *(struct pt_regs *)current_ptregs + else + print *(struct pt_regs *)current_ptregs + end + printf "pregs=0x%x, AX[10]=%ld, CX[11]=0x%lx, DX[12]=0x%lx, ORIG_AX[15]=%ld, IP[16]=0x%x, SP[19]=0x%x HOST_FS[25]=0x%x, FS[21]=0x%x, FP=0x%x\n", \ + current_ptregs, ((struct pt_regs *)current_ptregs)->regs.gp[10], \ + ((struct pt_regs *)current_ptregs)->regs.gp[11], ((struct pt_regs *)current_ptregs)->regs.gp[12],\ + ((struct pt_regs *)current_ptregs)->regs.gp[15], ((struct pt_regs *)current_ptregs)->regs.gp[16],\ + ((struct pt_regs *)current_ptregs)->regs.gp[19], ((struct pt_regs *)current_ptregs)->regs.gp[25],\ + ((struct pt_regs *)current_ptregs)->regs.gp[21], ((struct pt_regs *)current_ptregs)->regs.fp +end + +## source /home/tazaki/.gdbinit_defs +## +## define syscall +## printf "syscall/rax\t = %d(%s)\n", ((struct pt_regs 
*)current_ptregs)->regs.gp[15], \ +## $syscall_tbl[((struct pt_regs *)current_ptregs)->regs.gp[15]] +## printf "ret/rax\t\t\t = %d\n", $rax +## printf "retp/rcx\t\t = 0x%x\n", $rcx +## printf "arg0/rdi\t\t = 0x%x\n", $rdi +## printf "arg1/rsi\t\t = 0x%x\n", $rsi +## printf "arg2/rdx\t\t = 0x%x\n", $rdx +## printf "arg3/r10\t\t = 0x%x\n", $r10 +## printf "arg4/r8\t\t\t = 0x%x\n", $r8 +## printf "arg5/r9\t\t\t = 0x%x\n", $r9 +## end + +source /home/tazaki/work/mino/mino/gdbinit-minoc + +python +import subprocess +import re + +def relocatesections(filename, addr): + p = subprocess.Popen(["readelf", "-S", filename], stdout = subprocess.PIPE) + + sections = [] + textaddr = '0' + for line in p.stdout.readlines(): + line = line.decode("utf-8").strip() + if not line.startswith('[') or line.startswith('[Nr]'): + continue + + line = re.sub(r' +', ' ', line) + line = re.sub(r'\[ *(\d+)\]', r"\g<1>", line) + fieldsvalue = line.split(' ') + fieldsname = ['number', 'name', 'type', 'addr', 'offset', 'size', 'entsize', 'flags', 'link', 'info', 'addralign'] + sec = dict(zip(fieldsname, fieldsvalue)) + + if sec['number'] == '0': + continue + + sections.append(sec) + + if sec['name'] == '.text': + textaddr = sec['addr'] + + return (textaddr, sections) + + +class AddSymbolFileAll(gdb.Command): + """The right version for add-symbol-file""" + + def __init__(self): + super(AddSymbolFileAll, self).__init__("add-symbol-file-all", gdb.COMMAND_USER) + self.dont_repeat() + + def invoke(self, arg, from_tty): + argv = gdb.string_to_argv(arg) + filename = argv[0] + + if len(argv) > 1: + offset = int(str(gdb.parse_and_eval(argv[1])), 0) + else: + offset = 0 + + (textaddr, sections) = relocatesections(filename, offset) + + cmd = "add-symbol-file %s 0x%08x" % (filename, int(textaddr, 16) + offset) + + for s in sections: + addr = int(s['addr'], 16) + if s['name'] == '.text' or addr == 0: + continue + + cmd += " -s %s 0x%08x" % (s['name'], addr + offset) + + gdb.execute(cmd) + +class 
RemoveSymbolFileAll(gdb.Command): + """The right version for remove-symbol-file""" + + def __init__(self): + super(RemoveSymbolFileAll, self).__init__("remove-symbol-file-all", gdb.COMMAND_USER) + self.dont_repeat() + + def invoke(self, arg, from_tty): + argv = gdb.string_to_argv(arg) + filename = argv[0] + + if len(argv) > 1: + offset = int(str(gdb.parse_and_eval(argv[1])), 0) + else: + offset = 0 + + (textaddr, _) = relocatesections(filename, offset) + + cmd = "remove-symbol-file -a 0x%08x" % (int(textaddr, 16) + offset) + gdb.execute(cmd) + + +AddSymbolFileAll() +RemoveSymbolFileAll() +end +#source /home/tazaki/work/gdb-symbols.py diff --git a/.github/workflows/init_array.c b/.github/workflows/init_array.c new file mode 100644 index 00000000000000..8faf546c978cb6 --- /dev/null +++ b/.github/workflows/init_array.c @@ -0,0 +1,19 @@ +#include + +static void init(int argc, char **argv, char **envp) { + printf("> l:%s\n", __FUNCTION__); +} + +static void fini(void) { + printf("< l:%s\n", __FUNCTION__); +} + + +__attribute__((section(".init_array"), used)) static typeof(init) *init_p = init; +__attribute__((section(".fini_array"), used)) static typeof(fini) *fini_p = fini; + +int main (int argc, char *argv[]) +{ + return 0; +} + diff --git a/.github/workflows/inittab b/.github/workflows/inittab new file mode 100644 index 00000000000000..81eef106465f99 --- /dev/null +++ b/.github/workflows/inittab @@ -0,0 +1,13 @@ +# /etc/inittab + +::sysinit:/etc/init.d/rcS + +::respawn:-/bin/sh + +# Stuff to do when restarting the init process +::restart:/sbin/init + +# Stuff to do before rebooting +::ctrlaltdel:/sbin/reboot +::shutdown:/bin/umount -a -r +::shutdown:/sbin/swapoff -a diff --git a/.github/workflows/interfaces b/.github/workflows/interfaces new file mode 100644 index 00000000000000..eeec845556e067 --- /dev/null +++ b/.github/workflows/interfaces @@ -0,0 +1,2 @@ +auto vec0 +iface vec0 inet dhcp diff --git a/.github/workflows/iperf3.sh b/.github/workflows/iperf3.sh new 
file mode 100755 index 00000000000000..5053573fd47e2f --- /dev/null +++ b/.github/workflows/iperf3.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +NIC=vec0 +mount proc /proc -t proc +echo "nameserver 8.8.8.8" > /etc/resolv.conf +/sbin/ifconfig lo 127.0.0.1 up +/sbin/ifconfig $NIC 192.168.122.2 up + +/sbin/sysctl -w net.ipv4.tcp_wmem="40960 873800 1677721600" +/sbin/sysctl -w net.ipv4.tcp_rmem="40960 873800 1677721600" + +sleep 5 +IPERF=/root/iperf3.static +IPERF=iperf3 + +echo "===iperf3 forward===" +$IPERF -c 192.168.122.1 -fm +echo "===iperf3 reverse===" +$IPERF -c 192.168.122.1 -R -fm + +sh /root/netperf-bench.sh + +/sbin/halt -f diff --git a/.github/workflows/iperf3.static b/.github/workflows/iperf3.static new file mode 100755 index 00000000000000..79155612c05c8a Binary files /dev/null and b/.github/workflows/iperf3.static differ diff --git a/.github/workflows/lmbench_run.sh b/.github/workflows/lmbench_run.sh new file mode 100644 index 00000000000000..26596236ed4bf6 --- /dev/null +++ b/.github/workflows/lmbench_run.sh @@ -0,0 +1,26 @@ + +PATH=/usr/lib/lmbench/bin/x86_64-linux-gnu/:/lmbench2/bin/x86_64-linux-gnulibc1:$PATH +mkdir -p /var/tmp/lmbench +cp `which hello` /var/tmp/lmbench/hello +cp `which hello` /tmp/hello + +mkdir -p /usr/include/sys/ || true +touch /usr/include/sys/types.h || true + +if [ -d "/usr/lib/lmbench/bin/x86_64-linux-gnu/" ] ; then +ENOUGH=10000 lat_select -n 10 file +ENOUGH=10000 lat_select -n 100 file +ENOUGH=10000 lat_select -n 1000 file +else +ENOUGH=10000 lat_select file 10 +ENOUGH=10000 lat_select file 100 +ENOUGH=10000 lat_select file 1000 +fi + +ENOUGH=100000 lat_syscall null +ENOUGH=100000 lat_syscall read +ENOUGH=100000 lat_syscall write +ENOUGH=100000 lat_syscall stat +ENOUGH=100000 lat_syscall open +ENOUGH=10000 lat_proc shell +ENOUGH=10000 lat_proc exec diff --git a/.github/workflows/locale.c b/.github/workflows/locale.c new file mode 100644 index 00000000000000..1d4d83c71ff4fa --- /dev/null +++ b/.github/workflows/locale.c @@ -0,0 +1,11 @@ 
+#include +#include +#include +#include + +int main (int argc, char *argv[]) +{ + int ret = getpid(); + printf("strerr = %s\n", strerror(errno)); + return 0; +} diff --git a/.github/workflows/netperf-bench.sh b/.github/workflows/netperf-bench.sh new file mode 100644 index 00000000000000..64aabe9782a6a7 --- /dev/null +++ b/.github/workflows/netperf-bench.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +PSIZES="64 128 256 512 1024 1500 65507" +TESTNAMES="TCP_STREAM TCP_MAERTS" +DEST_ADDR=${1:-"192.168.122.1"} +NETPERF=${2:-/root/netperf} + +/sbin/sysctl -w net.ipv4.tcp_wmem="40960 873800 1677721600" +/sbin/sysctl -w net.ipv4.tcp_rmem="40960 873800 1677721600" + +for size in $PSIZES +do + for test in $TESTNAMES + do + echo "== netperf ($size-$test) ==" + $NETPERF -H $DEST_ADDR -t $test -- -o THROUGHPUT,THROUGHPUT_UNITS,LOCAL_SEND_SIZE,COMMAND_LINE -m "$size,$size" + + done +done + +$NETPERF -H $DEST_ADDR -t TCP_RR -- -o THROUGHPUT,THROUGHPUT_UNITS,LOCAL_SEND_SIZE,COMMAND_LINE diff --git a/.github/workflows/netperf-plot.sh b/.github/workflows/netperf-plot.sh new file mode 100644 index 00000000000000..2dfd03663ac110 --- /dev/null +++ b/.github/workflows/netperf-plot.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")" +OUTPUT="$1" + +mkdir -p "$OUTPUT/out/" + +# parse outputs +# TCP_STREAM +for f in `ls $OUTPUT/*-iperf3.dat` +do + cat $f | grep bits |grep STREAM | awk -F',' '{print $3" " $1}' \ + > $OUTPUT/out/`basename $f .dat|sed "s/iperf3/netperf/g"`-fwd-out.dat + cat $f | grep bits |grep MAERTS | awk -F',' '{print $3" " $1}' \ + > $OUTPUT/out/`basename $f .dat|sed "s/iperf3/netperf/g"`-rev-out.dat +done # end of ${DIR} + + +PSIZE_XTICS="('64' 0, '128' 2, '256' 4, '512' 6, '1024' 8, '1500' 10, '65507' 12)" +PAT_NATIVE='fill patter 2 lc rgb "red"' +PAT_MMU='fill patter 2 lc rgb "green"' +PAT_MMU_S='fill patter 2 lc rgb "dark-green"' +PAT_NOMMU_S='fill patter 2 lc rgb "royalblue"' +PAT_NOMMU_Z='fill patter 2 lc rgb "blue"' + +gnuplot << EndGNUPLOT +set 
terminal postscript color eps lw 3 "Helvetica" 24 +set output "${OUTPUT}/out/tcp-stream.eps" +#set xtics font "Helvetica,14" +set pointsize 2 +set xzeroaxis +set grid ytics + +set boxwidth 0.3 +set style fill pattern + +set size 1.0,0.9 +set key font ",18" +set key top left Left reverse +#set key above vertical maxrows 2 + +set xrange [-1:13] +set xtics ${PSIZE_XTICS} +set xlabel "Payload size (bytes)" +set yrange [-50:40] +#set ytics ('0' -10, '5' -5, '0' 0, '5' 5, '10' 10) +set ylabel "Goodput (Gbps)" offset +0.8 + + + +plot \ + '${OUTPUT}/out/native-netperf-fwd-out.dat' usin (\$0*2-0.6):(\$2/1000) w boxes $PAT_NATIVE title "native" ,\ + '${OUTPUT}/out/um-mmu-netperf-fwd-out.dat' usin (\$0*2-0.3):(\$2/1000) w boxes $PAT_MMU title "um(mmu)" ,\ + '${OUTPUT}/out/um-mmu-seccomp-netperf-fwd-out.dat' usin (\$0*2-0):(\$2/1000) w boxes $PAT_MMU_S title "um(mmu(s))" ,\ + '${OUTPUT}/out/um-nommu-seccomp-netperf-fwd-out.dat' usin (\$0*2+0.3):(\$2/1000) w boxes $PAT_NOMMU_S title "um(nommu(s))" ,\ + '${OUTPUT}/out/um-nommu-zpoline-netperf-fwd-out.dat' usin (\$0*2+0.6):(\$2/1000) w boxes $PAT_NOMMU_Z title "um(nommu(z))" ,\ + '${OUTPUT}/out/native-netperf-rev-out.dat' usin (\$0*2-0.6):(\$2*-1/1000) w boxes $PAT_NATIVE notitle ,\ + '${OUTPUT}/out/um-mmu-netperf-rev-out.dat' usin (\$0*2-0.3):(\$2*-1/1000) w boxes $PAT_MMU notitle ,\ + '${OUTPUT}/out/um-mmu-seccomp-netperf-rev-out.dat' usin (\$0*2-0):(\$2*-1/1000) w boxes $PAT_MMU_S notitle ,\ + '${OUTPUT}/out/um-nommu-seccomp-netperf-rev-out.dat' usin (\$0*2+0.3):(\$2*-1/1000) w boxes $PAT_NOMMU_S notitle ,\ + '${OUTPUT}/out/um-nommu-zpoline-netperf-rev-out.dat' usin (\$0*2+0.6):(\$2*-1/1000) w boxes $PAT_NOMMU_Z notitle + + +set terminal png lw 3 14 crop +set key font ",12" +set size 1.0,1.0 +set ylabel "Goodput (Gbps)" offset +0.5 +set output "${OUTPUT}/out/tcp-stream.png" +replot + + +#set terminal dumb +#unset key +#unset output +#replot + +quit +EndGNUPLOT + + + +echo "" +echo -e "### netperf bench (TCP_STREAM) (Mbps)\n" 
+echo -e "| psize | native | um |um-mmu(s) | um-nommu(s) | um-nommu(z)|\n|--|--|--|--|--|--|" +join $OUTPUT/out/native-netperf-fwd-out.dat $OUTPUT/out/um-mmu-netperf-fwd-out.dat \ + | join - $OUTPUT/out/um-mmu-seccomp-netperf-fwd-out.dat \ + | join - $OUTPUT/out/um-nommu-seccomp-netperf-fwd-out.dat \ + | join - $OUTPUT/out/um-nommu-zpoline-netperf-fwd-out.dat \ + | sed "s/ /\|/g" | sed "s/^/\|/" | sed "s/$/\|/" + +echo "" +echo -e "### netperf bench (TCP_MAERTS) (Mbps)" +echo -e "| psize | native | um |um-mmu(s) | um-nommu(s) | um-nommu(z)|\n|--|--|--|--|--|--|" +join $OUTPUT/out/native-netperf-rev-out.dat $OUTPUT/out/um-mmu-netperf-rev-out.dat \ + | join - $OUTPUT/out/um-mmu-seccomp-netperf-rev-out.dat \ + | join - $OUTPUT/out/um-nommu-seccomp-netperf-rev-out.dat \ + | join - $OUTPUT/out/um-nommu-zpoline-netperf-rev-out.dat \ + | sed "s/ /\|/g" | sed "s/^/\|/" | sed "s/$/\|/" diff --git a/.github/workflows/noop.c b/.github/workflows/noop.c new file mode 100644 index 00000000000000..23d5b86bec1197 --- /dev/null +++ b/.github/workflows/noop.c @@ -0,0 +1,38 @@ +#include "syscall_nr.h" + +static unsigned long __sysinfo = 0; +#include "syscall-x86_64.h" + +typedef unsigned long size_t; + +#define AUX_CNT 64 +#define DYN_CNT 32 + +#define AT_NULL 0 +#define AT_SYSINFO 32 + +void _start_c(size_t *sp) +{ + size_t i, aux[AUX_CNT], dyn[DYN_CNT]; + size_t *rel, rel_size, base; + + int argc = *sp; + char **argv = (void *)(sp+1); + + for (i=argc+1; argv[i]; i++); + size_t *auxv = (void *)(argv+i+1); + + for (i=0; i +#include +#include +#include +#include + +static void segv_handler (int cause, siginfo_t * info, void *uap) +{ + ucontext_t *context = uap; + //For test. 
Never ever call stdio functions in a signal handler otherwise*/ + printf ("SIGSEGV handled (mcontext=0x%lx)\n", (unsigned long)context->uc_mcontext.gregs); + + printf ("SIGSEGV raised at address 0x%lx\n", (unsigned long)context->uc_mcontext.gregs[REG_RIP]); + /*On my particular system, compiled with gcc -O2, the offending instruction + generated for "*f = 16;" is 6 bytes. Lets try to set the instruction + pointer to the next instruction (general register 14 is EIP, on linux x86) */ + context->uc_mcontext.gregs[14] += 14; + //alternativly, try to jump to a "safe place" + //context->uc_mcontext.gregs[14] = (unsigned int)safe_func; + exit(cause); +} + +/* XXX: this code doesn't work as 2nd fprintf() causes SEGV + * (probably) due to alignment issue of xmm0/rsp register + * + * https://stackoverflow.com/questions/5397041/getting-the-saved-instruction-pointer-address-from-a-signal-handler + */ +static void signal_segv(int signum, siginfo_t *info, void *ptr) +{ + static const char *si_codes[3] = {"", "SEGV_MAPERR", "SEGV_ACCERR"}; + int i, f = 0; + ucontext_t *ucontext = (ucontext_t*)ptr; + void **bp = 0; + void *ip = 0; + + fprintf(stderr, "Segmentation Fault!\n"); + fprintf(stderr, "info.si_signo = %d\n", signum); + fprintf(stderr, "info.si_errno = %d\n", info->si_errno); + fprintf(stderr, "info.si_code = %d (%s)\n", info->si_code, si_codes[info->si_code]); + fprintf(stderr, "info.si_addr = %p\n", info->si_addr); + for(i = 0; i < NGREG; i++) + fprintf(stderr, "reg[%02d] = 0x%016llx\n", i, ucontext->uc_mcontext.gregs[i]); + + ucontext->uc_mcontext.gregs[14] += 14; + exit(signum); +} + + +__asm__ ( +".section .text.nolibc_memmove_memcpy\n" +".weak nolibc_memmove\n" +".weak nolibc_memcpy\n" +"nolibc_memmove:\n" +"nolibc_memcpy:\n" + "movq %rdx, %rcx\n\t" + "movq %rdi, %rax\n\t" + "movq %rdi, %rdx\n\t" + "subq %rsi, %rdx\n\t" + "cmpq %rcx, %rdx\n\t" + "jb 1f\n\t" + "rep movsb\n\t" + "retq\n" +"1:" /* backward copy */ + "leaq -1(%rdi, %rcx, 1), %rdi\n\t" + "leaq -1(%rsi, 
%rcx, 1), %rsi\n\t" + "std\n\t" + "rep movsb\n\t" + "cld\n\t" + "retq\n" + +".section .text.nolibc_memset\n" +".weak memset\n" +"memset:\n" + "xchgl %eax, %esi\n\t" + "movq %rdx, %rcx\n\t" + "pushq %rdi\n\t" + "rep stosb\n\t" + "popq %rax\n\t" + "retq\n" +); + +void *nolibc_memcpy(void *dest, const void *src, size_t n); + +/* from arch/x86/boot/compressed/string.c */ +static void *____memcpy(void *dest, const void *src, size_t n) +{ + long d0, d1, d2; + asm volatile( + "rep movsq\n\t" + "movq %4,%%rcx\n\t" + "rep movsb" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) + : "memory"); + + return dest; +} + +static inline void do_host_sigsegv(void) +{ + char *ptr = NULL; +// ____memcpy(ptr, 0, 8); + nolibc_memcpy(ptr, 0, 4); +// *ptr = '1'; +} + +int main (int argc, char *argv[]) +{ + char *ptr = NULL; + struct sigaction sa; + int *f = NULL; + + if (argc != 3) { + printf("%s [nullptr or raise] [handler or not]\n", argv[0]); + return 0; + } + + if (atoi(argv[2]) >= 1) { + printf("register handler\n"); + if (atoi(argv[2]) == 1) + sa.sa_sigaction = segv_handler; + else + sa.sa_sigaction = signal_segv; + sigemptyset (&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + if (sigaction (SIGSEGV, &sa, 0)) { + perror ("sigaction"); + return -1; + } + } + + if (atoi(argv[1]) == 0) + do_host_sigsegv(); + else if (atoi(argv[1]) == 1) { + printf("raising signal\n"); + raise(SIGSEGV); + } + else if (atoi(argv[1]) == 2) { + int i = 1, j; + j = i / 0; + memcpy(ptr, 0, 8); + } + else { + printf("should: 0 <= argv[1] <= 2\n"); + } + return 0; +} +#else +// SPDX-License-Identifier: GPL-2.0 +/* + * Test FP register handling in userspace mcontext. + * + * Copyright (C) 2025 Intel Corporation + */ + +/* Is there a better way to *not* include bits/sigcontext.h? 
*/ +#include +#undef __USE_MISC +//#include + +#include +#include +#include +#include +#include +#include +#include +#include +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +//#include "../../../tools/testing/selftests/kselftest.h" +#define ksft_print_msg printf +#define ksft_print_header() do {} while(0) +#define ksft_exit_fail_msg printf +#define ksft_test_result_pass printf +#define ksft_test_result_fail printf +#define ksft_set_plan +#define ksft_finished() do {} while(0) + +#define ST0_EXP_ADD 10 + +static void sighandler(int sig, siginfo_t *info, void *p) +{ + ucontext_t *uc = p; + struct _fpstate *fpstate = (void *)uc->uc_mcontext.fpregs; + + ksft_print_msg("sighandler(%d): extended_size: %d, xstate_size: %d\n", sig, + fpstate->padding[13], + fpstate->padding[16]); + +#ifdef __i386__ + fpstate->_st[0].exponent += ST0_EXP_ADD; + fpstate->_xmm[1].element[0] |= 0x01010101; + fpstate->_xmm[1].element[1] |= 0x01010101; + fpstate->_xmm[1].element[2] |= 0x01010101; + fpstate->_xmm[1].element[3] |= 0x01010101; +#else + /* Hacky way of modifying the exponent without breaking aliasing */ + fpstate->_st[0].exponent += ST0_EXP_ADD; + fpstate->_xmm[1].element[0] |= 0x01010101; + fpstate->_xmm[1].element[1] |= 0x01010101; + fpstate->_xmm[1].element[2] |= 0x01010101; + fpstate->_xmm[1].element[3] |= 0x01010101; +#endif + + if (sig == SIGSEGV || sig == SIGFPE) + exit(sig); +} + +static void do_self_sigusr1(uint32_t **sse, double *num) +{ + long ret; + + /* + * This does kill(getpid(), SIGUSR1); with "num" being passed in AND + * out of the floating point stack. We can therefore modify num by + * changing st[0] when handling the signal. 
+ */ +#ifdef __i386__ + asm volatile ( + "movups %1, %%xmm1;" + "int $0x80;" + "movups %%xmm1, %1;" + : "=t" (*num), "=m" (*sse), "=a" (ret) + : "0" (*num), "2" (__NR_kill), "b" (getpid()), "c" (SIGUSR1) : + "xmm1", "memory"); +#else + asm volatile ( + "movups %1, %%xmm1;" + "syscall;" + "movups %%xmm1, %1;" + : "=t" (*num), "=m"(*sse), "=a" (ret) + : "0" (*num), "2" (__NR_kill), "D" (getpid()), "S" (SIGUSR1) + : "r11", "rcx", "xmm1", "memory"); +#endif +} + + +static inline void do_host_sigfpe(uint32_t **sse, double *num) +{ + char *ptr = NULL; + int i = 1, j; + j = i / 0; +} + +static int test_mcontext(int xmm_should_change, void(*func)(uint32_t **, double *)) +{ + double num = 0.5; + uint32_t sse[4] = {0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00 }; + int xmm_manipulated; + + ksft_print_msg("pre-signal: %d / 100, %08x %08x %08x %08x\n", (int) (100*num), sse[0], sse[1], sse[2], sse[3]); + + func((uint32_t **)&sse, &num); + + if (sse[0] == 0x11223344 || sse[1] == 0x55667788 || sse[2] == 0x99aabbcc || sse[3] == 0xddeeff00) + xmm_manipulated = 0; + else if (sse[0] == 0x11233345 || sse[1] == 0x55677789 || sse[2] == 0x99abbbcd || sse[3] == 0xddefff01) + xmm_manipulated = 1; + else + xmm_manipulated = 2; + + ksft_print_msg("post-signal: %d / 100, %08x %08x %08x %08x (should change: %d, changed: %d)\n", + (int) (100 * num), sse[0], sse[1], sse[2], sse[3], xmm_should_change, xmm_manipulated); + + if (num != (1 << (ST0_EXP_ADD - 1))) { + ksft_print_msg("floating point register was not manipulated\n"); + return 1; + } + + if (xmm_manipulated != xmm_should_change) { + ksft_print_msg("xmm/sse had unexpected value!\n"); + return 1; + } + + return 0; +} + +static int test_mcontext2(int xmm_should_change) +{ + long ret; + double num = 0.5; + uint32_t sse[4] = {0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00 }; + int xmm_manipulated; + + ksft_print_msg("pre-signal: %d / 100, %08x %08x %08x %08x\n", (int) (100*num), sse[0], sse[1], sse[2], sse[3]); + + /* + * This does 
kill(getpid(), SIGUSR1); with "num" being passed in AND + * out of the floating point stack. We can therefore modify num by + * changing st[0] when handling the signal. + */ +#ifdef __i386__ + asm volatile ( + "movups %1, %%xmm1;" + "int $0x80;" + "movups %%xmm1, %1;" + : "=t" (num), "=m" (sse), "=a" (ret) + : "0" (num), "2" (__NR_kill), "b" (getpid()), "c" (SIGUSR1) : + "xmm1", "memory"); +#else + asm volatile ( + "movups %1, %%xmm1;" + "syscall;" + "movups %%xmm1, %1;" + : "=t" (num), "=m"(sse), "=a" (ret) + : "0" (num), "2" (__NR_kill), "D" (getpid()), "S" (SIGUSR1) + : "r11", "rcx", "xmm1", "memory"); +#endif + + if (sse[0] == 0x11223344 || sse[1] == 0x55667788 || sse[2] == 0x99aabbcc || sse[3] == 0xddeeff00) + xmm_manipulated = 0; + else if (sse[0] == 0x11233345 || sse[1] == 0x55677789 || sse[2] == 0x99abbbcd || sse[3] == 0xddefff01) + xmm_manipulated = 1; + else + xmm_manipulated = 2; + + ksft_print_msg("post-signal: %d / 100, %08x %08x %08x %08x (should change: %d, changed: %d)\n", + (int) (100 * num), sse[0], sse[1], sse[2], sse[3], xmm_should_change, xmm_manipulated); + + if (num != (1 << (ST0_EXP_ADD - 1))) { + ksft_print_msg("floating point register was not manipulated\n"); + return 1; + } + + if (xmm_manipulated != xmm_should_change) { + ksft_print_msg("xmm/sse had unexpected value!\n"); + return 1; + } + + return 0; +} + +static void my_sa_restorer(void) +{ + syscall(__NR_rt_sigreturn); +} + +#define _NSIG 65 +#define SIGPT_SET \ + ((sigset_t *)(const unsigned long [_NSIG/8/sizeof(long)]){ \ + [sizeof(long)==4] = 3UL<<(32*(sizeof(long)>4)) }) + +struct k_sigaction { + void (*handler)(int); + unsigned long flags; + void (*restorer)(void); + unsigned mask[2]; +}; + +static int my_sigaction(int signum, struct sigaction *act, struct sigaction *oldact) +{ + struct k_sigaction ksa, oksa; + + ksa.handler = act->sa_handler; + ksa.flags = act->sa_flags; + ksa.flags |= SA_RESTORER; + ksa.restorer = my_sa_restorer; + memcpy(&ksa.mask, &act->sa_mask, _NSIG/8); + + 
return syscall(__NR_rt_sigaction, signum, &ksa, oldact ? & oksa : 0, _NSIG/8); +} + +/* Entry point: argv[1] selects the fault to trigger (0, 1 or 2), argv[2] >= 1 installs sighandler for SIGUSR1/SIGSEGV/SIGFPE first. */ +int main(int argc, char *argv[]) +{ + struct sigaction sa = { + .sa_flags = SA_SIGINFO, + .sa_handler = (void (*)(int))sighandler, + .sa_mask = 0, + }; + + ksft_print_header(); + ksft_set_plan(1); + + /* argv[1] and argv[2] are both read unconditionally below; print usage instead of dereferencing a missing argument */ + if (argc != 3) { + printf("%s [nullptr or raise] [handler or not]\n", argv[0]); + return 0; + } + + if (atoi(argv[2]) >= 1) { + printf("register handler\n"); + if (my_sigaction(SIGUSR1, &sa, NULL) < 0) { + perror ("sigaction"); + return -1; + } + if (my_sigaction(SIGSEGV, &sa, NULL) < 0) { + perror ("sigaction"); + return -1; + } + if (my_sigaction(SIGFPE, &sa, NULL) < 0) { + perror ("sigaction"); + return -1; + } + } + + if (atoi(argv[1]) == 0) + test_mcontext(1, do_host_sigsegv); + else if (atoi(argv[1]) == 1) { + printf("raising signal\n"); + test_mcontext2(1); + //raise(SIGSEGV); + } + else if (atoi(argv[1]) == 2) { + test_mcontext(1, do_host_sigfpe); + } + else { + printf("should: 0 <= argv[1] <= 2\n"); + } + + ksft_finished(); + return 0; +} + +#endif diff --git a/.github/workflows/pthread.c b/.github/workflows/pthread.c new file mode 100644 index 00000000000000..f56e135c4c2fee --- /dev/null +++ b/.github/workflows/pthread.c @@ -0,0 +1,84 @@ +#include +#include +#include +#include + +/* this function is run by the second thread */ +void *inc_x(void *x_void_ptr) +{ + /* Allow this thread to be cancelled even if it's in a syscall */ + pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + printf("set cancel\n"); + + /* increment x to 100 */ + int *x_ptr = (int *)x_void_ptr; + + while(++(*x_ptr) < 100); + + printf("x increment finished\n"); + + /* the function must return something - NULL will do */ + return NULL; + +} + +#define NUM_LOOP 1000 +int main(int argc, char *argv[]) +{ + + int x = 0, y = 0, i = 0, num_loop = NUM_LOOP; + + /* show the initial values of x and y */ + printf("x: %d, y: %d\n", x, y); + + /* this variable is our reference to the second thread */ + pthread_t inc_x_thread; + + if (argc > 1) + num_loop =
atoi(argv[1]); + + while (i < num_loop) { + /* create a second thread which executes inc_x(&x) */ + if(pthread_create(&inc_x_thread, NULL, inc_x, &x)) { + + fprintf(stderr, "Error creating thread\n"); + return 1; + + } +#if 0 + /* XXX: sometimes doesn't work */ + if(pthread_setname_np(inc_x_thread, "child-p")) { + fprintf(stderr, "%p: Error setting thread name (%d)\n", inc_x_thread, errno); + return 1; + } +#endif + + /* increment y to 100 in the first thread */ + while(++y < 100); + //sleep(1); + printf("y increment finished\n"); + + if (argc > 2) { + if(pthread_cancel(inc_x_thread)) { + fprintf(stderr, "Error canceling thread\n"); + return 3; + } + } + + /* wait for the second thread to finish */ + if(pthread_join(inc_x_thread, NULL)) { + + fprintf(stderr, "Error joining thread\n"); + return 2; + + } + + /* show the results - x is now 100 thanks to the second thread */ + printf("x: %d, y: %d\n", x, y); + i++; + } + + return 0; + +} diff --git a/.github/workflows/rcS b/.github/workflows/rcS new file mode 100755 index 00000000000000..a7efe02c4c20c9 --- /dev/null +++ b/.github/workflows/rcS @@ -0,0 +1,16 @@ +#!/bin/sh + +NIC=vec0 +mount proc /proc -t proc +echo "nameserver 8.8.8.8" > /etc/resolv.conf +/sbin/ifconfig lo 127.0.0.1 up +/sbin/ifconfig $NIC 192.168.122.2 up +/sbin/ip route add default via 192.168.122.1 dev $NIC + +export PATH=/home:/sbin:/usr/sbin:/bin:/usr/bin +export TERM=linux + +# hanging +#export PS1="\[$(tput bold)\]\[$(tput setaf 4)\][\[$(tput setaf 5)\]\u\[$(tput setaf 4)\]@\[$(tput setaf 5)\]\h \[$(tput setaf 2)\]\W\[$(tput setaf 4)\]]\\$ \[$(tput sgr0)\]" + +exec "$@" diff --git a/.github/workflows/stackp.c b/.github/workflows/stackp.c new file mode 100644 index 00000000000000..df058766eecf69 --- /dev/null +++ b/.github/workflows/stackp.c @@ -0,0 +1,15 @@ +#include +#include +#include +#include +#include +#include +#include + + +int main(int argc, char *argv[]) +{ + char str[64]; + + write(2, "Done\nChild\n", 11); +} diff --git 
a/.github/workflows/test-signal-restore.c b/.github/workflows/test-signal-restore.c new file mode 100644 index 00000000000000..6808abd1a488d7 --- /dev/null +++ b/.github/workflows/test-signal-restore.c @@ -0,0 +1,386 @@ +/* + * gcc test-signal-restore.c -o test-signal-restore-amd64 + * gcc -m32 -march=i686 -lm test-signal-restore.c -o test-signal-restore-i386 + */ + +/* Is there a better way to *not* include bits/sigcontext.h? */ +#include +#undef __USE_MISC +//#include + +#include +#include +#include +#include +#include +#include +#include +#include +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#define ST0_EXP_ADD 10 + +void *scratch_page; + +void sighandler(int sig, siginfo_t *info, void *p) +{ + ucontext_t *uc = p; + +#if 0 + printf("sighandler: extended_size: %d, xstate_size: %d\n", + ((struct _fpstate *)uc->uc_mcontext.fpregs)->sw_reserved.extended_size, + ((struct _fpstate *)uc->uc_mcontext.fpregs)->sw_reserved.xstate_size); +#endif + uc->uc_mcontext.fpregs->_st[0].exponent += ST0_EXP_ADD; + + printf("%d: sh\n", gettid()); +} + +int test_fp(double num) +{ + long ret; + double orig_num = num; + + printf("%d: pre-signal: %g\n", gettid(), num); + /* + * This does kill(getpid(), SIGUSR1); with "num" being passed in AND + * out of the floating point stack. We can therefore modify num by + * changing st[0] when handling the signal. 
+ */ +#ifdef __i386__ + asm volatile ( + "int $0x80;" + : "=t" (num), "=a" (ret) + : "0" (num), "1" (__NR_kill), "b" (getpid()), "c" (SIGUSR1) : ); +#else + asm volatile ( + "syscall;" + : "=t" (num), "=a" (ret) + : "0" (num), "1" (__NR_kill), "D" (getpid()), "S" (SIGUSR1) : "r11", "rcx"); +#endif + sleep(1); + printf("%d: post-signal: %g\n", gettid(), num); + + if (num != (pow(2, ST0_EXP_ADD) * orig_num)) { + printf("%d: floating point register was not manipulated\n", gettid()); + return 1; + } + + return 0; +} + +enum source { + S_FPREGS = 0, + S_FPXREGS = 1, + S_GETREGS_FPREGS = 2, + S_GETREGS_XFPREGS = 3, + S_GETREGS_XSTATE = 4, +}; + +static void *thread1(void *arg) +{ + int val = (intptr_t)arg; + + for (int i = 0; i < 2; i++ ){ + double num = 0.5 + val; + double orig_num = num; + +// printf("%s: >> num: %f tid=%d\n", __func__, num, gettid()); + sleep(3); + + num = pow(2, ST0_EXP_ADD) * num; + printf("%s: << orig_num: %f num: %f tid=%d\n", __func__, orig_num, num, gettid()); + + if (pow(2, ST0_EXP_ADD) * orig_num != num) { + abort(); + return (void *)-1; + } + } + + return NULL; +} + +int test_fp_pthread(enum source source) +{ + pthread_t th[10]; + int ret = 0; + long arg = 0; + + for (int i = 0; i < sizeof(th)/sizeof(pthread_t); i++) { + if(pthread_create(&th[i], NULL, thread1, (void *)arg++)) { + perror("pthread_create"); + return 1; + } + } +#if 0 + if (test_fp(0.9)) { + perror("test_fp_pthread 2nd"); + } +#endif + for (int i = 0; i < sizeof(th)/sizeof(pthread_t); i++) { + void *retval; + if(pthread_join(th[i], &retval)) { + perror("pthread_join"); + return 1; + } + + if ((intptr_t)retval == -1) + ret = -1; + + } + + return ret; +} + +int test_fp_ptrace(enum source source) +{ +#if 0 + int pid, status, ret; + + pid = vfork(); + if (pid < 0) + return 127; + + if (pid == 0) { + /* child */ + ptrace(PTRACE_TRACEME, 0, 0, 0); + kill(getpid(), SIGSTOP); + + if (test_fp(0.5)) + exit(1); + + exit(0); + } + + /* Wait for child to stop itself */ + do { + ret = 
waitpid(pid, &status, 0); + } while (ret < 0 && errno == EINTR); + if (!WIFSTOPPED(status)) + return 127; + + /* Continue until SIGUSR1 to self */ + ptrace(PTRACE_CONT, pid, NULL, 0); + do { + ret = waitpid(pid, &status, 0); + } while (ret < 0 && errno == EINTR); + if (!WIFSTOPPED(status)) + return 127; + + if (source == S_FPXREGS || source == S_GETREGS_XFPREGS) { +#ifdef __i386__ + struct user_fpxregs_struct *fpstate; + struct iovec iov = { + .iov_len = sizeof(*fpstate), + }; + int ret; + + fpstate = scratch_page + 4096 - iov.iov_len; + iov.iov_base = fpstate; + + if (source == S_GETREGS_XFPREGS) + ret = ptrace(PTRACE_GETREGSET, pid, NT_PRXFPREG, &iov); + else + ret = ptrace(PTRACE_GETFPXREGS, pid, NULL, fpstate); + + if (ret) { + kill(pid, SIGKILL); + if (errno == EINVAL) { + printf("Getting FPX regs not supported\n"); + return 0; + } else { + printf("Error getting FPX regs: %d\n", errno); + return 127; + } + } + ((struct _fpxreg*)&fpstate->st_space[0])->exponent += ST0_EXP_ADD; + + if (source == S_GETREGS_XFPREGS) + ret = ptrace(PTRACE_SETREGSET, pid, NT_PRXFPREG, &iov); + else + ret = ptrace(PTRACE_SETFPXREGS, pid, NULL, fpstate); + if (ret) + return -127; + +#else + printf("No FPXREGS on x86_64\n"); + kill(pid, SIGKILL); + return 127; +#endif + } else if (source == S_FPREGS || source == S_GETREGS_FPREGS) { + struct _fpstate *fpstate; + struct iovec iov = { + .iov_len = sizeof(*fpstate), + }; + + fpstate = scratch_page; // + 4096 - sizeof(*fpstate); + iov.iov_base = fpstate; + + if (source == S_GETREGS_FPREGS) + ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); + else + ret = ptrace(PTRACE_GETFPREGS, pid, NULL, fpstate); + + + if (ret) { + kill(pid, SIGKILL); + if (errno == EINVAL) { + printf("Getting FP regs not supported\n"); + return 0; + } else { + printf("Error getting FPX regs: %d\n", errno); + return 127; + } + } +#ifdef __i386__ + ((struct _fpreg*) &fpstate->_st[0])->exponent += ST0_EXP_ADD; +#else + ((struct _fpstate*) &fpstate)->_st[0].exponent 
+= ST0_EXP_ADD; +#endif + + if (source == S_GETREGS_FPREGS) + ret = ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); + else + ret = ptrace(PTRACE_SETFPREGS, pid, NULL, fpstate); + + if (ret) + return 127; + } else if (source == S_GETREGS_XSTATE) { +#ifdef __i386__ + struct user_fpxregs_struct *fpstate; +#else + struct user_fpregs_struct *fpstate; +#endif + struct iovec iov = { + .iov_len = 4096, + }; + + fpstate = scratch_page + 4096 - iov.iov_len; + iov.iov_base = fpstate; + + ret = ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); + if (ret) { + kill(pid, SIGKILL); + if (errno == EINVAL) { + printf("Getting XSTATE not supported\n"); + return 0; + } else { + printf("Error getting XSTATE size: %d\n", errno); + return 127; + } + } + + printf("host xstate size: %ld\n", iov.iov_len); + + /* Second time with the exact length (to test the kernel) */ + fpstate = scratch_page + 4096 - iov.iov_len; + iov.iov_base = fpstate; + + ret = ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); + if (ret) { + printf("Error getting XSTATE: %d\n", errno); + return 127; + } + + fpstate = scratch_page + 4096 - iov.iov_len; + iov.iov_base = fpstate; + + ret = ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); + if (ret) { + kill(pid, SIGKILL); + printf("Error getting XSTATE (with correct size): %d\n", errno); + return 127; + } + +#ifdef __i386__ + ((struct _fpxreg *)&fpstate->st_space[0])->exponent += ST0_EXP_ADD; +#else + ((struct _fpstate *)&fpstate)->_st[0].exponent += ST0_EXP_ADD; +#endif + + ret = ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov); + if (ret) { + printf("Failed to set XSTATE: %d\n", errno); + return 127; + } + + } else { + return 127; + } + + /* Run until completion (without handling the signal) */ + ptrace(PTRACE_CONT, pid, NULL, 0); + do { + ret = waitpid(pid, &status, 0); + } while (ret < 0 && errno == EINTR); + + if (!WIFEXITED(status)) + return 127; + + return WEXITSTATUS(status); +#endif +} + +int main() +{ + struct sigaction sa = { + .sa_flags = 
SA_SIGINFO, + .sa_handler = (void (*)(int))sighandler, + }; + int ret; + + scratch_page = mmap(NULL, 8192, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + munmap(scratch_page + 4096, 4096); + + sigaction(SIGUSR1, &sa, NULL); + + if (test_fp(0.5)) + return 1; + + ret = test_fp_pthread(S_FPREGS); + if (ret) + return ret; + +#if 0 + sa.sa_handler = SIG_DFL; + sigaction(SIGUSR1, &sa, NULL); + + printf("\nmodify using ptrace PTRACE_SETFPREGS instead of sighandler:\n"); + ret = test_fp_ptrace(S_FPREGS); + if (ret) + return ret; + +#ifdef __i386__ + printf("\nmodify using ptrace PTRACE_SETFPXREGS instead of sighandler:\n"); + ret = test_fp_ptrace(S_FPXREGS); + if (ret) + return ret; +#endif + + + printf("\nmodify using ptrace PTRACE_SETREGSET, via NT_PRFPREG instead of sighandler:\n"); + ret = test_fp_ptrace(S_GETREGS_FPREGS); + if (ret) + return ret; + +#ifdef __i386__ + printf("\nmodify using ptrace PTRACE_SETREGSET, via NT_XFPREGS instead of sighandler:\n"); + ret = test_fp_ptrace(S_GETREGS_XFPREGS); + if (ret) + return ret; +#endif + + printf("\nmodify using ptrace PTRACE_SETREGSET, via NT_X86_XSTATE instead of sighandler:\n"); + ret = test_fp_ptrace(S_GETREGS_XSTATE); + if (ret) + return ret; + +#endif + return 0; +} diff --git a/.github/workflows/um-nommu-bench.sh b/.github/workflows/um-nommu-bench.sh new file mode 100755 index 00000000000000..601e128ccbb9f5 --- /dev/null +++ b/.github/workflows/um-nommu-bench.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +SCRIPT_DIR="$(cd $(dirname "${BASH_SOURCE[0]}"); pwd)" + +# macos has different syntax +OUTPUT=$SCRIPT_DIR/$(date "+%Y-%m-%d") +mkdir -p $OUTPUT + +TRIALS="1" +ENTRIES="1000000" +VSIZES="1 8 256 1024 8192" +RUNTIMES="build/mmu build" + + +echo "$(tput bold)== um (mmu) ($test-$num-$vsize) ==$(tput sgr0)" +../linux-um-nommu/build-mmu/vmlinux ubd0=./alpine-test.ext3 rw mem=1024m loglevel=0 init=/bench.sh \ + | tee "$OUTPUT/um-mmu.dat" + +echo "$(tput bold)== um (nommu) ($test-$num-$vsize) ==$(tput sgr0)" 
+../linux-um-nommu/build/vmlinux ubd0=./alpine-test.ext3 rw mem=1024m loglevel=0 init=/bench.sh \ + | tee "$OUTPUT/um-nommu.dat" + +echo "$(tput bold)== host (mmu) ($test-$num-$vsize) ==$(tput sgr0)" +sh docker/alpine/lmbench_run.sh \ + |& tee "$OUTPUT/native.dat" +./zpoline-bench/do_getpid -c 100 | tee -a "$OUTPUT/native.dat" + +bash ${SCRIPT_DIR}/um-nommu-plot.sh ${OUTPUT} diff --git a/.github/workflows/um-nommu-plot.sh b/.github/workflows/um-nommu-plot.sh new file mode 100755 index 00000000000000..1abe04b985c336 --- /dev/null +++ b/.github/workflows/um-nommu-plot.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")" + +OUTPUT="$1" +DIRS="read fill" + +mkdir -p "$OUTPUT/out/" + +# parse outputs + +for f in `ls $OUTPUT/*.dat |grep -v iperf3` +do + cat $f | grep -a microsec | sed "s/.*:\(.*\)/\1/" | awk '{print $1}' \ + > $OUTPUT/out/`basename $f .dat`-lmbench-out.dat + + cat $f | grep -a average |grep -v time | awk '{print $2 $3}' \ + > $OUTPUT/out/`basename $f .dat`-getpid-out.dat +done + +# parse iperf3 result +for f in `ls $OUTPUT/*-iperf3.dat` +do + cat $f | grep receiver | awk '{print $7}' \ + > $OUTPUT/out/`basename $f .dat`-out.dat +done + +gnuplot << EndGNUPLOT +set terminal postscript eps lw 3 "Helvetica" 24 +set output "${OUTPUT}/out/lmbench.eps" +#set xtics font "Helvetica,14" +set pointsize 2 +set xzeroaxis +set grid ytics + +set boxwidth 0.2 +set style fill pattern + +set size 1.0,0.8 +set key top left + +set xrange [-0.5:10] +set xtics ('select-10' 0, 'select-100' 1, 'select-1000' 2, 'syscall' 3, 'read' 4, 'write' 5, 'stat' 6, 'open/close' 7, 'fork+sh' 8, 'fork+execve' 9) +set xtics rotate by 45 right +set yrange [0.01:100000] +set ylabel "Latency (usec)" +set logscale y + +plot \ + '${OUTPUT}/out/native-lmbench-out.dat' usin (\$0-0.4):(\$1) w boxes fill patter 2 lt 1 lc rgb "red" title "native", \ + '${OUTPUT}/out/um-mmu-lmbench-out.dat' usin (\$0-0.2):(\$1) w boxes fill patter 2 lt 1 lc rgb "green" title "um(mmu)" ,\ + 
'${OUTPUT}/out/um-mmu-seccomp-lmbench-out.dat' usin (\$0-0):(\$1) w boxes fill patter 2 lt 1 lc rgb "dark-green" title "um(mmu(s))" ,\ + '${OUTPUT}/out/um-nommu-seccomp-lmbench-out.dat' usin (\$0+0.2):(\$1) w boxes fill patter 2 lt 1 lc rgb "royalblue" title "um(nommu(s))" ,\ + '${OUTPUT}/out/um-nommu-zpoline-lmbench-out.dat' usin (\$0+0.4):(\$1) w boxes fill patter 2 lt 1 lc rgb "blue" title "um(nommu(z))" + +set terminal png lw 3 14 crop +set output "${OUTPUT}/out/lmbench.png" +replot + +EndGNUPLOT + +echo -e "### lmbench (usec)\n" +echo -e "|select-10\n|select-100\n|select-1000\n|syscall\n|read\n|write\n|stat\n|open/close\n|fork+sh\n|fork+execve" > /tmp/a + +echo -e "||native|um|um-mmu(s)|um-nommu(s)|um-nommu(z)|\n|--|--|--|--|--|--|"; paste -d "|" `ls ${OUTPUT}/out/*-lmbench-out.dat` | sed "s/\(.*\)/\|\1\|/" | paste /tmp/a - | column -t + +rm -f /tmp/a + +echo "" +echo -e "### do_getpid bench (nsec)\n" +for f in `ls $OUTPUT/*.dat |grep -v iperf3` +do + export $(basename $f .dat|sed "s/-/_/g")=`grep -a aver $f | grep -v time | awk '{print $2}'` +done +echo -e "||native|um|um-mmu(s)|um-nommu(s)|um-nommu(z)|\n|--|--|--|--|--|--|" +echo "|getpid | ${native} | ${um_mmu} | ${um_mmu_seccomp} | ${um_nommu_seccomp}| ${um_nommu_zpoline}|" + + +# iperf result + +gnuplot << EndGNUPLOT +set terminal postscript eps lw 3 "Helvetica" 24 +set output "${OUTPUT}/out/iperf3.eps" +#set xtics font "Helvetica,14" +set pointsize 2 +set xzeroaxis + +set boxwidth 0.2 +set style fill pattern + +set size 1.0,0.8 +set key top left + +set xrange [-0.5:] +set xtics ('iperf(f)' 0, 'iperf(r)' 1) +#set xtics rotate by 45 right +set yrange [:50] +set ytics 10 +set ylabel "Goodput (Gbps)" + +plot \ + '${OUTPUT}/out/native-iperf3-out.dat' usin (\$0-0.4):(\$1/1000) w boxes fill patter 2 lt 1 lc rgb "red" title "native",\ + '${OUTPUT}/out/um-mmu-iperf3-out.dat' usin (\$0-0.2):(\$1/1000) w boxes fill patter 2 lt 1 lc rgb "green" title "um(mmu)" ,\ + '${OUTPUT}/out/um-mmu-seccomp-iperf3-out.dat' usin 
(\$0-0):(\$1/1000) w boxes fill patter 2 lt 1 lc rgb "dark-green" title "um(mmu(s))" ,\ + '${OUTPUT}/out/um-nommu-seccomp-iperf3-out.dat' usin (\$0+0.2):(\$1/1000) w boxes fill patter 2 lt 1 lc rgb "royalblue" title "um(nommu(s))" ,\ + '${OUTPUT}/out/um-nommu-zpoline-iperf3-out.dat' usin (\$0+0.4):(\$1/1000) w boxes fill patter 2 lt 1 lc rgb "blue" title "um(nommu(z))" + + +set terminal png lw 3 14 crop +set output "${OUTPUT}/out/iperf3.png" +replot + +EndGNUPLOT + +echo "" +echo -e "### iperf3 bench (Mbps)\n" +for f in `ls $OUTPUT/out/*iperf3*.dat` +do + export $(basename $f .dat|sed "s/-/_/g" | sed "s/_iperf3_out/_f/")=`cat $f|awk NR==1` + export $(basename $f .dat|sed "s/-/_/g" | sed "s/_iperf3_out/_r/")=`cat $f|awk NR==2` + + +done +echo -e "||native|um|um-mmu(s)|um-nommu(s)|um-nommu(z)|\n|--|--|--|--|--|--|" +echo "|iperf3(f) | ${native_f} | ${um_mmu_f} | ${um_mmu_seccomp_f} | ${um_nommu_seccomp_f}| ${um_nommu_zpoline_f}|" +echo "|iperf3(r) | ${native_r} | ${um_mmu_r} | ${um_mmu_seccomp_r} | ${um_nommu_seccomp_r}| ${um_nommu_zpoline_r}|" diff --git a/.github/workflows/vdso_test.c b/.github/workflows/vdso_test.c new file mode 100644 index 00000000000000..f5891cfe5445e2 --- /dev/null +++ b/.github/workflows/vdso_test.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include + + +int main(int argc, char* const* argv) +{ + int ch; + unsigned long loopcnt = 0; + + while ((ch = getopt(argc, argv, "c:")) != -1) { + switch (ch) { + case 'c': + loopcnt = atol(optarg); + break; + + default: + printf("unknown option\n"); + exit(1); + } + } + + if (!loopcnt) { + printf("please specify loop count by -c\n"); + exit(0); + } + + unsigned long t; + struct timespec ts; + struct timeval tv; + + unsigned long i, j; + for (j = 0; j < 2; j++) { + clock_gettime(CLOCK_REALTIME, &ts); + t = ts.tv_sec * 1000000000UL + ts.tv_nsec; + + for (i = 0; i < loopcnt; i++) { + if (j == 0) + clock_gettime(CLOCK_REALTIME, &ts); + else + gettimeofday(&tv, NULL); + } + + 
clock_gettime(CLOCK_REALTIME, &ts); + t = ts.tv_sec * 1000000000UL + ts.tv_nsec - t; + printf("average[%s] %5lu nsec\n", j == 0 ? "clock_gettime" + : "gettimeofday", t / loopcnt); + } + + + return 0; +} diff --git a/Documentation/dev-tools/kunit/api/index.rst b/Documentation/dev-tools/kunit/api/index.rst index 5cdb552a0808f2..34d8fee9a97059 100644 --- a/Documentation/dev-tools/kunit/api/index.rst +++ b/Documentation/dev-tools/kunit/api/index.rst @@ -9,6 +9,7 @@ API Reference test resource functionredirection + uapi clk of platformdevice @@ -32,6 +33,10 @@ Documentation/dev-tools/kunit/api/functionredirection.rst - Documents the KUnit Function Redirection API +Documentation/dev-tools/kunit/api/uapi.rst + + - Documents the KUnit Userspace testing API + Driver KUnit API ================ diff --git a/Documentation/dev-tools/kunit/api/uapi.rst b/Documentation/dev-tools/kunit/api/uapi.rst new file mode 100644 index 00000000000000..1f01b5c6c9db42 --- /dev/null +++ b/Documentation/dev-tools/kunit/api/uapi.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== +Userspace Test API +================== + +This file documents all of the userspace testing API. +Userspace tests are built as :ref:`kbuild userprogs `, +linked statically and without any external dependencies. + +For the widest platform compatibility they should use nolibc, as provided by `init/Makefile.nolibc`. + +.. kernel-doc:: include/kunit/uapi.h + :internal: diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index 24a4708d26e8ef..6e54ad4cfecbf7 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -891,6 +891,8 @@ This is possible in two ways: This will tell kbuild to build lxdialog even if not referenced in any rule. +.. 
_kbuild_userprogs: + Userspace Program support ========================= diff --git a/Documentation/virt/uml/nommu-uml.rst b/Documentation/virt/uml/nommu-uml.rst new file mode 100644 index 00000000000000..e9f238a5e5806d --- /dev/null +++ b/Documentation/virt/uml/nommu-uml.rst @@ -0,0 +1,181 @@ +.. SPDX-License-Identifier: GPL-2.0 + +UML has been built with CONFIG_MMU since day 0. The patchset +introduces the nommu mode on UML from a different angle in order to +cover broader code to be executed over UML. + +.. contents:: :local: + +What is it for ? +================ + +- To exercise nommu code over UML (and over KUnit) +- Reduce syscall latency triggered with the interactions of multiple + processes + + +How it works ? +============== + +To illustrate how this feature works, the following shows how syscalls are +called under nommu/UML environment. + +- boot kernel, install seccomp filter if ``syscall`` instructions are + called from userspace memory based on the address of instruction + pointer +- (userspace starts) +- calls ``vfork``/``execve`` syscalls +- ``SIGSYS`` signal raised, handler calls syscall entry point ``__kernel_vsyscall`` +- call handler function in ``sys_call_table[]`` and follow how UML syscall + works +- return to userspace + + +What are the differences from MMU-full UML ? +============================================ + +The current nommu implementation adds 3 different functions which +MMU-full UML doesn't have: + +- kernel address space is directly accessible from userspace + - so, ``uaccess()`` always returns 1 + - generic implementation of memcpy/strcpy/futex is also used +- alternate syscall entrypoint +- alternate syscall hook with a different seccomp filter + +With those modifications, it allows us to use unmodified userspace +binaries with nommu UML.
+ + +History +======= + +This feature was originally introduced by Ricardo Koller at Open +Source Summit NA 2020, then integrated with the syscall translation +functionality with the clean up to the original code. + +Build and run +================ + +:: + + make ARCH=um x86_64_nommu_defconfig + make ARCH=um + +will build UML with ``CONFIG_MMU=n`` applied. + +Kunit tests can run with the following command:: + + ./tools/testing/kunit/kunit.py run --kconfig_add CONFIG_MMU=n --kconfig_add CONFIG_KUNIT_UML_PCI=n + +To run a typical Linux distribution, we need nommu-aware userspace. +We can use a stock version of Alpine Linux with nommu-built version of +busybox and musl-libc, which are available as additional packages. + + +Preparing root filesystem +========================= + +nommu UML requires a specific standard library which is aware +of nommu kernels. We have tested custom-built musl-libc and busybox, +both of which have built-in support for nommu kernels. + +There are no available Linux distributions for nommu under x86_64 +architecture, so we need to prepare our own image for the root +filesystem. We use Alpine Linux as a base distribution and replace +busybox and musl-libc on top of that. The following are the steps to +prepare the filesystem for the quick start:: + + container_id=$(docker create ghcr.io/thehajime/alpine:3.20.3-um-nommu) + docker start $container_id + docker wait $container_id + docker export $container_id > alpine.tar + docker rm $container_id + + mnt=$(mktemp -d) + dd if=/dev/zero of=alpine.ext4 bs=1 count=0 seek=1G + sudo chmod og+wr "alpine.ext4" + yes 2>/dev/null | mkfs.ext4 "alpine.ext4" || true + sudo mount "alpine.ext4" $mnt + sudo tar -xf alpine.tar -C $mnt + sudo umount $mnt + +This will create a file image, ``alpine.ext4``, which contains busybox +and musl with nommu build on the Alpine Linux root filesystem.
The +file can be passed via the ``ubd0=`` argument on the UML command line:: + + ./vmlinux ubd0=./alpine.ext4 rw mem=1024m loglevel=8 init=/sbin/init + +We plan to upstream the apk packages for busybox and musl so that we +can follow the proper procedure to set up the root filesystem. + + +Quick start with docker +======================= + +There is a docker image that you can quickly start with a simple step:: + + docker run -it -v /dev/shm:/dev/shm --rm ghcr.io/thehajime/alpine:3.20.3-um-nommu + +This will launch a UML instance with a pre-configured root filesystem. + +Benchmark +========= + +The following shows an example of performance measurement conducted with +lmbench and (self-crafted) getpid benchmark (with v6.17-rc5 uml/next +tree). + +.. csv-table:: lmbench (usec) + :header: ,native,um,um-mmu(s),um-nommu(s),um-nommu(z) + + select-10 ,0.5319,36.1214,24.2795,2.9174,0.3585 + select-100 ,1.6019,34.6049,28.8865,3.8080,1.2597 + select-1000 ,12.2588,43.6838,48.7438,12.7872,10.2857 + syscall ,0.1644,35.0321,53.2119,2.5981,0.1568 + read ,0.3055,31.5509,45.8538,2.7068,0.2228 + write ,0.2512,31.3609,29.2636,2.6948,0.2106 + stat ,1.8894,43.8477,49.6121,3.1908,0.5278 + open/close ,3.2973,77.5123,68.9431,6.2575,0.9021 + fork+sh ,1110.3000,7359.5000,4618.6667,439.4615,19016.0000 + fork+execve ,510.8182,2834.0000,2461.1667,139.7848,4849.3333 + +.. csv-table:: do_getpid bench (nsec) + :header: ,native,um,um-mmu(s),um-nommu(s),um-nommu(z) + + getpid , 161 , 34477 , 26242 , 2599, 163 + +(um-nommu(s) is with seccomp syscall hook, um-mmu(s) is SECCOMP mode, +um-nommu(z) is nommu with zpoline syscall hook, respectively) + +Limitations +=========== + +generic nommu limitations +------------------------- +Since this port is a kernel of nommu architecture, the implementation +inherits the characteristics of other nommu kernels (riscv, arm, etc), +described below.
+ +- vfork(2) should be used instead of fork(2) +- ELF loader only loads PIE (position independent executable) binaries +- processes share the address space among others +- mmap(2) offers a subset of functionalities (e.g., unsupported + MAP_FIXED) + +Thus, we have limited options for userspace programs. We have tested +Alpine Linux with musl-libc, which has support for nommu kernels. + +supported architecture +---------------------- +The current implementation of nommu UML only works on x86_64 SUBARCH. +We have not tested with a 32-bit environment. + + +Further readings about NOMMU UML +================================ + +- NOMMU UML (original code by Ricardo Koller) + - https://static.sched.com/hosted_files/ossna2020/ec/kollerr_linux_um_nommu.pdf +- introduction at FOSDEM 2026 + - https://fosdem.org/2026/schedule/event/HWLQDV-nommu-uml-thehajime/ diff --git a/MAINTAINERS b/MAINTAINERS index 0c01baefb9c315..d2e46f23da0df3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14276,6 +14276,15 @@ S: Maintained F: Documentation/devicetree/bindings/leds/backlight/kinetic,ktz8866.yaml F: drivers/video/backlight/ktz8866.c +KUNIT UAPI TESTING FRAMEWORK (in addition to KERNEL UNIT TESTING FRAMEWORK) +M: Thomas Weißschuh +S: Maintained +F: include/kunit/uapi.h +F: lib/kunit/kunit-example-uapi.c +F: lib/kunit/kunit-test-uapi.c +F: lib/kunit/kunit-uapi.c +F: lib/kunit/uapi-preinit.c + KVM PARAVIRT (KVM/paravirt) M: Paolo Bonzini R: Vitaly Kuznetsov @@ -18764,6 +18773,8 @@ M: Willy Tarreau M: Thomas Weißschuh S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/nolibc/linux-nolibc.git +F: init/Kconfig.nolibc +F: init/Makefile.nolibc F: tools/include/nolibc/ F: tools/testing/selftests/nolibc/ @@ -27445,6 +27456,7 @@ USER-MODE LINUX (UML) M: Richard Weinberger M: Anton Ivanov M: Johannes Berg +M: Hajime Tazaki L: linux-um@lists.infradead.org S: Maintained W: http://user-mode-linux.sourceforge.net diff --git a/Makefile b/Makefile index 13107aa0a5fb12..04d9b28e007e4f 100644 ---
a/Makefile +++ b/Makefile @@ -1170,8 +1170,8 @@ ifneq ($(CONFIG_ARCH_VMLINUX_NEEDS_RELOCS),) LDFLAGS_vmlinux += --emit-relocs --discard-none endif -# Align the architecture of userspace programs with the kernel -USERFLAGS_FROM_KERNEL := --target=% +# Align the bit size, byte order and architecture of userspace programs with the kernel +USERFLAGS_FROM_KERNEL := -m32 -m64 -mlittle-endian -mbig-endian --target=% -march=% -mabi=% ifdef CONFIG_ARCH_USERFLAGS KBUILD_USERCFLAGS += $(CONFIG_ARCH_USERFLAGS) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 098cda44db2255..67e8dc3d1e0a2f 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -35,7 +35,7 @@ config UML select ARCH_SUPPORTS_LTO_CLANG_THIN select TRACE_IRQFLAGS_SUPPORT select TTY # Needed for line.c - select HAVE_ARCH_VMAP_STACK + select HAVE_ARCH_VMAP_STACK if MMU select HAVE_RUST select ARCH_HAS_UBSAN select HAVE_ARCH_TRACEHOOK @@ -43,9 +43,12 @@ config UML select THREAD_INFO_IN_TASK select SPARSE_IRQ select MMU_GATHER_RCU_TABLE_FREE + select UACCESS_MEMCPY if !MMU + select GENERIC_STRNLEN_USER if !MMU + select GENERIC_STRNCPY_FROM_USER if !MMU config MMU - bool + bool "MMU-based Paged Memory Management Support" if 64BIT default y config UML_DMA_EMULATION @@ -227,8 +230,15 @@ config MAGIC_SYSRQ The keys are documented in . Don't say Y unless you really know what this hack does. 
+config ARCH_FORCE_MAX_ORDER + int "Order of maximal physically contiguous allocations" if EXPERT + default "10" if MMU + default "16" if !MMU + config KERNEL_STACK_ORDER int "Kernel stack size order" + default 3 if !MMU + range 3 10 if !MMU default 2 if 64BIT range 2 10 if 64BIT default 1 if !64BIT diff --git a/arch/um/Makefile b/arch/um/Makefile index 721b652ffb6584..e0bd7983381d21 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -46,6 +46,11 @@ ARCH_INCLUDE := -I$(srctree)/$(SHARED_HEADERS) ARCH_INCLUDE += -I$(srctree)/$(HOST_DIR)/um/shared KBUILD_CPPFLAGS += -I$(srctree)/$(HOST_DIR)/um +ifneq ($(CONFIG_MMU),y) +core-y += $(ARCH_DIR)/nommu/ +KBUILD_CPPFLAGS += -I$(srctree)/$(HOST_DIR)/um/nommu +endif + # -Dstrrchr=kernel_strrchr (as well as the various in6addr symbols) prevents # anything from referencing # libc symbols with the same name, which can cause a linker error. @@ -147,6 +152,12 @@ export CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE) $(CC_FLAGS_ CLEAN_FILES += linux x.i gmon.out MRPROPER_FILES += $(HOST_DIR)/include/generated +ifeq ($(CONFIG_MMU),y) +UTS_MACHINE := "um" +else +UTS_MACHINE := "um\(nommu\)" +endif + archclean: @find . 
\( -name '*.bb' -o -name '*.bbg' -o -name '*.da' \ -o -name '*.gcov' \) -type f -print | xargs rm -f diff --git a/arch/um/configs/x86_64_nommu_defconfig b/arch/um/configs/x86_64_nommu_defconfig new file mode 100644 index 00000000000000..02cb87091c9f91 --- /dev/null +++ b/arch/um/configs/x86_64_nommu_defconfig @@ -0,0 +1,54 @@ +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_CGROUPS=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_SCHED=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +# CONFIG_PID_NS is not set +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_MMU is not set +CONFIG_HOSTFS=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_SSL=y +CONFIG_NULL_CHAN=y +CONFIG_PORT_CHAN=y +CONFIG_PTY_CHAN=y +CONFIG_TTY_CHAN=y +CONFIG_CON_CHAN="pts" +CONFIG_SSL_CHAN="pts" +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_IOSCHED_BFQ=m +CONFIG_BINFMT_MISC=m +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_BLK_DEV_UBD=y +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_NBD=m +CONFIG_DUMMY=m +CONFIG_TUN=m +CONFIG_PPP=m +CONFIG_SLIP=m +CONFIG_LEGACY_PTY_COUNT=32 +CONFIG_UML_RANDOM=y +CONFIG_EXT4_FS=y +CONFIG_QUOTA=y +CONFIG_AUTOFS_FS=m +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_NLS=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y +CONFIG_FRAME_WARN=1024 +CONFIG_IPV6=y diff --git a/arch/um/include/asm/futex.h b/arch/um/include/asm/futex.h index 780aa6bfc050c2..785fd6649aa26f 100644 --- a/arch/um/include/asm/futex.h +++ b/arch/um/include/asm/futex.h @@ -7,8 +7,12 @@ #include +#ifdef CONFIG_MMU int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr); int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval); +#else +#include +#endif #endif diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h index 07d48738b402b4..c0b9ce3215c431 100644 
--- a/arch/um/include/asm/mmu.h +++ b/arch/um/include/asm/mmu.h @@ -21,6 +21,14 @@ typedef struct mm_context { spinlock_t sync_tlb_lock; unsigned long sync_tlb_range_from; unsigned long sync_tlb_range_to; + +#ifndef CONFIG_MMU + unsigned long end_brk; +#ifdef CONFIG_BINFMT_ELF_FDPIC + unsigned long exec_fdpic_loadmap; + unsigned long interp_fdpic_loadmap; +#endif +#endif /* !CONFIG_MMU */ } mm_context_t; #define INIT_MM_CONTEXT(mm) \ diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index c727e56ba116ce..528b217da285b0 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h @@ -18,11 +18,13 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, { } +#ifdef CONFIG_MMU #define init_new_context init_new_context extern int init_new_context(struct task_struct *task, struct mm_struct *mm); #define destroy_context destroy_context extern void destroy_context(struct mm_struct *mm); +#endif #include diff --git a/arch/um/include/asm/ptrace-generic.h b/arch/um/include/asm/ptrace-generic.h index 86d74f9d33cf2d..5aa38fe6b2fbe7 100644 --- a/arch/um/include/asm/ptrace-generic.h +++ b/arch/um/include/asm/ptrace-generic.h @@ -14,7 +14,7 @@ struct pt_regs { struct uml_pt_regs regs; }; -#define arch_has_single_step() (1) +#define arch_has_single_step() (IS_ENABLED(CONFIG_MMU)) #define EMPTY_REGS { .regs = EMPTY_UML_PT_REGS } @@ -29,6 +29,12 @@ struct pt_regs { #define PTRACE_OLDSETOPTIONS 21 +#ifdef CONFIG_BINFMT_ELF_FDPIC +#define PTRACE_GETFDPIC 31 +#define PTRACE_GETFDPIC_EXEC 0 +#define PTRACE_GETFDPIC_INTERP 1 +#endif + struct task_struct; extern long subarch_ptrace(struct task_struct *child, long request, diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 0df9ea4abda830..031b357800b785 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -18,6 +18,7 @@ #define __addr_range_nowrap(addr, size) \ ((unsigned long) (addr) <= ((unsigned 
long) (addr) + (size))) +#ifdef CONFIG_MMU extern unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n); extern unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n); extern unsigned long __clear_user(void __user *mem, unsigned long len); @@ -29,9 +30,6 @@ static inline int __access_ok(const void __user *ptr, unsigned long size); #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER - -#include - static inline int __access_ok(const void __user *ptr, unsigned long size) { unsigned long addr = (unsigned long)ptr; @@ -63,5 +61,8 @@ do { \ barrier(); \ current->thread.segv_continue = NULL; \ } while (0) +#endif + +#include #endif diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index 38321188c04c4c..46c8d6336ca147 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -63,9 +63,15 @@ extern void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs extern void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, void *mc); extern void fatal_sigsegv(void) __attribute__ ((noreturn)); +extern void sigsys_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc); void um_idle_sleep(void); void kasan_map_memory(void *start, size_t len); +#ifndef CONFIG_MMU +extern void nommu_relay_signal(void *ptr); +#endif + #endif diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index b26e94292fc14d..be1a527f0228ce 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -189,6 +189,7 @@ extern void check_host_supports_tls(int *supports_tls, int *tls_min); extern void get_host_cpu_features( void (*flags_helper_func)(char *line), void (*cache_helper_func)(char *line)); +extern int host_has_fsgsbase; /* mem.c */ extern int create_mem_file(unsigned long long len); @@ -213,6 +214,11 @@ extern int os_protect_memory(void *addr, unsigned long len, extern int 
os_unmap_memory(void *addr, int len); extern int os_drop_memory(void *addr, int length); extern int can_drop_memory(void); +extern int os_arch_prctl(int pid, int option, unsigned long *arg); +#ifndef CONFIG_MMU +extern long long host_fs; +#endif + void os_set_pdeathsig(void); @@ -356,4 +362,17 @@ static inline void os_local_ipi_enable(void) { } static inline void os_local_ipi_disable(void) { } #endif /* CONFIG_SMP */ +/* seccomp.c */ +#ifdef CONFIG_MMU +static inline int os_setup_seccomp(void) +{ + return 0; +} +#else +extern int os_setup_seccomp(void); + +/* zpoline.c */ +extern int um_zpoline_enabled; +#endif + #endif diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index be60bc451b3f9b..76d36751973ee4 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -16,9 +16,10 @@ always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ physmem.o process.o ptrace.o reboot.o sigio.o \ - signal.o sysrq.o time.o tlb.o trap.o \ - um_arch.o umid.o kmsg_dump.o capflags.o skas/ + signal.o sysrq.o time.o \ + um_arch.o umid.o kmsg_dump.o capflags.o obj-y += load_file.o +obj-$(CONFIG_MMU) += mem-pgtable.o tlb.o trap.o skas/ obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o obj-$(CONFIG_GPROF) += gprof_syms.o diff --git a/arch/um/kernel/mem-pgtable.c b/arch/um/kernel/mem-pgtable.c new file mode 100644 index 00000000000000..549da1d3bff083 --- /dev/null +++ b/arch/um/kernel/mem-pgtable.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Allocate and free page tables. 
*/ + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + + if (pgd) { + memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + } + return pgd; +} + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY, + [VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_EXEC | VM_WRITE] = PAGE_COPY, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED +}; +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 89c8c8b94a7967..2c728fe3de0b4d 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -72,7 +71,8 @@ void __init arch_mm_preinit(void) * to be turned on. */ brk_end = PAGE_ALIGN((unsigned long) sbrk(0)); - map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); + map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, + !IS_ENABLED(CONFIG_MMU)); memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; min_low_pfn = PFN_UP(__pa(uml_reserved)); @@ -107,45 +107,11 @@ void free_initmem(void) { } -/* Allocate and free page tables. 
*/ - -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = __pgd_alloc(mm, 0); - - if (pgd) - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - - return pgd; -} - void *uml_kmalloc(int size, int flags) { return kmalloc(size, flags); } -static const pgprot_t protection_map[16] = { - [VM_NONE] = PAGE_NONE, - [VM_READ] = PAGE_READONLY, - [VM_WRITE] = PAGE_COPY, - [VM_WRITE | VM_READ] = PAGE_COPY, - [VM_EXEC] = PAGE_READONLY, - [VM_EXEC | VM_READ] = PAGE_READONLY, - [VM_EXEC | VM_WRITE] = PAGE_COPY, - [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY, - [VM_SHARED] = PAGE_NONE, - [VM_SHARED | VM_READ] = PAGE_READONLY, - [VM_SHARED | VM_WRITE] = PAGE_SHARED, - [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, - [VM_SHARED | VM_EXEC] = PAGE_READONLY, - [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY, - [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED, - [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED -}; -DECLARE_VM_GET_PAGE_PROT - void mark_rodata_ro(void) { unsigned long rodata_start = PFN_ALIGN(__start_rodata); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 63b38a3f73f78d..b07c1f120910e0 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -307,3 +308,40 @@ unsigned long __get_wchan(struct task_struct *p) return 0; } + +extern void start_kernel(void); + +static int __init start_kernel_proc(void *unused) +{ + block_signals_trace(); + + start_kernel(); + return 0; +} + +char cpu_irqstacks[NR_CPUS][THREAD_SIZE] __aligned(THREAD_SIZE); + +int __init start_uml(void) +{ + stack_protections((unsigned long) &cpu_irqstacks[0]); + set_sigstack(cpu_irqstacks[0], THREAD_SIZE); + + init_new_thread_signals(); + + init_task.thread.request.thread.proc = start_kernel_proc; + init_task.thread.request.thread.arg = NULL; + return start_idle_thread(task_stack_page(&init_task), + 
&init_task.thread.switch_buf); +} + +static DEFINE_SPINLOCK(initial_jmpbuf_spinlock); + +void initial_jmpbuf_lock(void) +{ + spin_lock_irq(&initial_jmpbuf_spinlock); +} + +void initial_jmpbuf_unlock(void) +{ + spin_unlock_irq(&initial_jmpbuf_spinlock); +} diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index 4a7673b0261a88..d643854942bc49 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -17,31 +17,6 @@ #include #include -extern void start_kernel(void); - -static int __init start_kernel_proc(void *unused) -{ - block_signals_trace(); - - start_kernel(); - return 0; -} - -char cpu_irqstacks[NR_CPUS][THREAD_SIZE] __aligned(THREAD_SIZE); - -int __init start_uml(void) -{ - stack_protections((unsigned long) &cpu_irqstacks[0]); - set_sigstack(cpu_irqstacks[0], THREAD_SIZE); - - init_new_thread_signals(); - - init_task.thread.request.thread.proc = start_kernel_proc; - init_task.thread.request.thread.arg = NULL; - return start_idle_thread(task_stack_page(&init_task), - &init_task.thread.switch_buf); -} - unsigned long current_stub_stack(void) { if (current->mm == NULL) @@ -65,15 +40,3 @@ void current_mm_sync(void) um_tlb_sync(current->mm); } - -static DEFINE_SPINLOCK(initial_jmpbuf_spinlock); - -void initial_jmpbuf_lock(void) -{ - spin_lock_irq(&initial_jmpbuf_spinlock); -} - -void initial_jmpbuf_unlock(void) -{ - spin_unlock_irq(&initial_jmpbuf_spinlock); -} diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index e2b24e1ecfa649..27c13423d9aa80 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -423,6 +423,9 @@ void __init setup_arch(char **cmdline_p) add_bootloader_randomness(rng_seed, sizeof(rng_seed)); memzero_explicit(rng_seed, sizeof(rng_seed)); } + + /* install seccomp filter */ + os_setup_seccomp(); } void __init arch_cpu_finalize_init(void) diff --git a/arch/um/nommu/Makefile b/arch/um/nommu/Makefile new file mode 100644 index 00000000000000..096221590cfdd9 --- /dev/null 
+++ b/arch/um/nommu/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y := trap.o os-Linux/ diff --git a/arch/um/nommu/os-Linux/Makefile b/arch/um/nommu/os-Linux/Makefile new file mode 100644 index 00000000000000..805e26ccf63b0a --- /dev/null +++ b/arch/um/nommu/os-Linux/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y := seccomp.o signal.o +USER_OBJS := $(obj-y) + +include $(srctree)/arch/um/scripts/Makefile.rules +USER_CFLAGS+=-I$(srctree)/arch/um/os-Linux diff --git a/arch/um/nommu/os-Linux/seccomp.c b/arch/um/nommu/os-Linux/seccomp.c new file mode 100644 index 00000000000000..d1cfa6e3d63222 --- /dev/null +++ b/arch/um/nommu/os-Linux/seccomp.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include +#include +#include + +int __init os_setup_seccomp(void) +{ + int err; + unsigned long __userspace_start = uml_reserved, + __userspace_end = high_physmem; + + struct sock_filter filter[] = { + /* if (IP_high > __userspace_end) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JGT + BPF_K, __userspace_end >> 32, + /*true-skip=*/0, /*false-skip=*/1), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* if (IP_high == __userspace_end && IP_low >= __userspace_end) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __userspace_end >> 32, + /*true-skip=*/0, /*false-skip=*/3), + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer)), + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, __userspace_end, + /*true-skip=*/0, /*false-skip=*/1), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* if (IP_high < __userspace_start) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, 
instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, __userspace_start >> 32, + /*true-skip=*/1, /*false-skip=*/0), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* if (IP_high == __userspace_start && IP_low < __userspace_start) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __userspace_start >> 32, + /*true-skip=*/0, /*false-skip=*/3), + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer)), + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, __userspace_start, + /*true-skip=*/1, /*false-skip=*/0), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* other address; trap */ + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRAP), + }; + struct sock_fprog prog = { + .len = ARRAY_SIZE(filter), + .filter = filter, + }; + + err = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (err) + os_warn("PR_SET_NO_NEW_PRIVS (err=%d, errno=%d)\n", + err, errno); + + err = syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, &prog); + if (err) { + os_warn("SECCOMP_SET_MODE_FILTER (err=%d, errno=%d)\n", + err, errno); + exit(1); + } + + set_handler(SIGSYS); + + os_info("seccomp: setup filter syscalls in the range: 0x%lx-0x%lx\n", + __userspace_start, __userspace_end); + + return 0; +} + diff --git a/arch/um/nommu/os-Linux/signal.c b/arch/um/nommu/os-Linux/signal.c new file mode 100644 index 00000000000000..6febb178dcdacb --- /dev/null +++ b/arch/um/nommu/os-Linux/signal.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +void sigsys_handler(int sig, struct siginfo *si, + struct uml_pt_regs *regs, void *ptr) +{ + mcontext_t *mc = (mcontext_t *) ptr; + + /* hook syscall via SIGSYS */ + set_mc_sigsys_hook(mc); +} + +void nommu_relay_signal(void *ptr) +{ + mcontext_t *mc = (mcontext_t *) ptr; + + set_mc_relay_signal(mc); +} diff --git a/arch/um/nommu/trap.c 
b/arch/um/nommu/trap.c new file mode 100644 index 00000000000000..430297517455c7 --- /dev/null +++ b/arch/um/nommu/trap.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by + * segv(). @code_out is always set, as segv() forwards it as si_code. + */ +int handle_page_fault(unsigned long address, unsigned long ip, + int is_write, int is_user, int *code_out) +{ + *code_out = SEGV_MAPERR; /* !MMU has no pagefault */ + return -EFAULT; +} + +static void show_segv_info(struct uml_pt_regs *regs) +{ + struct task_struct *tsk = current; + struct faultinfo *fi = UPT_FAULTINFO(regs); + + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + pr_warn_ratelimited("%s%s[%d]: segfault at %lx ip %p sp %p error %x", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi), + (void *)UPT_IP(regs), (void *)UPT_SP(regs), + fi->error_code); +} + +static void bad_segv(struct faultinfo fi, unsigned long ip) +{ + current->thread.arch.faultinfo = fi; + force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi)); +} + +void fatal_sigsegv(void) +{ + force_fatal_sig(SIGSEGV); + do_signal(&current->thread.regs); + /* + * This is to tell gcc that we're not returning - do_signal + * can, in general, return, but in this case, it's not, since + * we just got a fatal SIGSEGV queued. + */ + os_dump_core(); +} + +/** + * segv_handler() - the SIGSEGV handler + * @sig: the signal number + * @unused_si: the signal info struct; unused in this handler + * @regs: the ptrace register information + * + * The handler first extracts the faultinfo from the UML ptrace regs struct. + * If the userfault did not happen in an UML userspace process, bad_segv is called. + * Otherwise the signal did happen in a cloned userspace process, handle it. 
+ */ +void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) +{ + struct faultinfo *fi = UPT_FAULTINFO(regs); + + /* !MMU specific part; detection of userspace */ + /* mark is_user=1 when the IP is from userspace code. */ + if (UPT_IP(regs) > uml_reserved && UPT_IP(regs) < high_physmem) + regs->is_user = 1; + + if (UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)) { + show_segv_info(regs); + bad_segv(*fi, UPT_IP(regs)); + return; + } + segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc); + + /* !MMU specific part; detection of userspace */ + relay_signal(sig, unused_si, regs, mc); +} + +/* + * We give a *copy* of the faultinfo in the regs to segv. + * This must be done, since nesting SEGVs could overwrite + * the info in the regs. A pointer to the info then would + * give us bad data! + */ +unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, + struct uml_pt_regs *regs, void *mc) +{ + int si_code; + int err; + int is_write = FAULT_WRITE(fi); + unsigned long address = FAULT_ADDRESS(fi); + + if (!is_user && regs) + current->thread.segv_regs = container_of(regs, struct pt_regs, regs); + + if (current->mm == NULL) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault with no mm"); + } else if (!is_user && address > PAGE_SIZE && address < TASK_SIZE) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx", + address, ip); + } + + if (SEGV_IS_FIXABLE(&fi)) + err = handle_page_fault(address, ip, is_write, is_user, + &si_code); + else { + err = -EFAULT; + /* + * A thread accessed NULL, we get a fault, but CR2 is invalid. + * This code is used in __do_copy_from_user() of TT mode. 
+ * XXX tt mode is gone, so maybe this isn't needed any more + */ + address = 0; + } + + if (!err) + goto out; + else if (!is_user && arch_fixup(ip, regs)) + goto out; + + if (!is_user) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Kernel mode fault at addr 0x%lx, ip 0x%lx", + address, ip); + } + + show_segv_info(regs); + + if (err == -EACCES) { + current->thread.arch.faultinfo = fi; + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); + } else { + WARN_ON_ONCE(err != -EFAULT); + current->thread.arch.faultinfo = fi; + force_sig_fault(SIGSEGV, si_code, (void __user *) address); + } + +out: + if (regs) + current->thread.segv_regs = NULL; + + return 0; +} + +void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc) +{ + int code, err; + + /* !MMU specific part; detection of userspace */ + /* mark is_user=1 when the IP is from userspace code. */ + if (UPT_IP(regs) > uml_reserved && UPT_IP(regs) < high_physmem) + regs->is_user = 1; + + if (!UPT_IS_USER(regs)) { + if (sig == SIGBUS) + pr_err("Bus error - the host /dev/shm or /tmp mount likely just ran out of space\n"); + panic("Kernel mode signal %d", sig); + } + /* if is_user==1, set return to userspace sig handler to relay signal */ + nommu_relay_signal(mc); + + arch_examine_signal(sig, regs); + + /* Is the signal layout for the signal known? + * Signal data must be scrubbed to prevent information leaks. 
+ */ + code = si->si_code; + err = si->si_errno; + if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) { + struct faultinfo *fi = UPT_FAULTINFO(regs); + + current->thread.arch.faultinfo = *fi; + force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi)); + } else { + pr_err("Attempted to relay unknown signal %d (si_code = %d) with errno %d\n", + sig, code, err); + force_sig(sig); + } +} + +void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) +{ + do_IRQ(WINCH_IRQ, regs); +} diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index f8d672d570d9b6..40e3e0eab6a056 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -8,7 +8,8 @@ KCOV_INSTRUMENT := n obj-y = elf_aux.o execvp.o file.o helper.o irq.o main.o mem.o process.o \ registers.o sigio.o signal.o start_up.o time.o tty.o \ - umid.o user_syms.o util.o skas/ + umid.o user_syms.o util.o +obj-$(CONFIG_MMU) += skas/ CFLAGS_signal.o += -Wframe-larger-than=4096 diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h index bac9fcc8c14c5a..25cb5cc931c143 100644 --- a/arch/um/os-Linux/internal.h +++ b/arch/um/os-Linux/internal.h @@ -6,6 +6,14 @@ #include #include +/* NOMMU doesn't work with thread-local storage used in CONFIG_SMP, + * due to the dependency on host_fs variable switch upon user/kernel + * context so, disable TLS until NOMMU supports SMP. 
+ */ +#ifndef CONFIG_MMU +#define __thread +#endif + /* * elf_aux.c */ diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c index 72f302f4d197f8..4f5d9a94f8e200 100644 --- a/arch/um/os-Linux/mem.c +++ b/arch/um/os-Linux/mem.c @@ -213,6 +213,10 @@ int __init create_mem_file(unsigned long long len) { int err, fd; + /* NOMMU kernel uses -1 as a fd for further use (e.g., mmap) */ + if (!IS_ENABLED(CONFIG_MMU)) + return -1; + fd = create_tmp_file(len); err = os_set_exec_close(fd); diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index 3a2a84ab93257d..4f30a389c5304d 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -17,10 +18,17 @@ #include #include #include +#include /* For SYS_xxx definitions */ +#include #include #include #include #include +#include +#include + +int using_seccomp; +int unscheduled_userspace_iterations; void os_alarm_process(int pid) { @@ -93,8 +101,8 @@ int os_map_memory(void *virt, int fd, unsigned long long off, unsigned long len, prot = (r ? PROT_READ : 0) | (w ? PROT_WRITE : 0) | (x ? PROT_EXEC : 0); - loc = mmap64((void *) virt, len, prot, MAP_SHARED | MAP_FIXED, - fd, off); + loc = mmap64((void *) virt, len, prot, MAP_SHARED | MAP_FIXED | + (!IS_ENABLED(CONFIG_MMU) ? MAP_ANONYMOUS : 0), fd, off); if (loc == MAP_FAILED) return -errno; return 0; @@ -172,6 +180,11 @@ int __init can_drop_memory(void) return ok; } +int os_arch_prctl(int pid, int option, unsigned long *arg2) +{ + return syscall(SYS_arch_prctl, option, arg2); +} + void init_new_thread_signals(void) { set_handler(SIGSEGV); @@ -209,3 +222,125 @@ int os_futex_wake(void *uaddr) NULL, NULL, 0)); return r < 0 ? 
-errno : r; } + +int is_skas_winch(int pid, int fd, void *data) +{ + return pid == getpgrp(); +} + +void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) +{ + (*buf)[0].JB_IP = (unsigned long) handler; + (*buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE - + sizeof(void *); +} + +#define INIT_JMP_NEW_THREAD 0 +#define INIT_JMP_CALLBACK 1 +#define INIT_JMP_HALT 2 +#define INIT_JMP_REBOOT 3 + +void switch_threads(jmp_buf *me, jmp_buf *you) +{ + unscheduled_userspace_iterations = 0; + + if (UML_SETJMP(me) == 0) + UML_LONGJMP(you, 1); +} + +static jmp_buf initial_jmpbuf; + +static __thread void (*cb_proc)(void *arg); +static __thread void *cb_arg; +static __thread jmp_buf *cb_back; + +int start_idle_thread(void *stack, jmp_buf *switch_buf) +{ + int n; + + set_handler(SIGWINCH); + + /* + * Can't use UML_SETJMP or UML_LONGJMP here because they save + * and restore signals, with the possible side-effect of + * trying to handle any signals which came when they were + * blocked, which can't be done on this stack. + * Signals must be blocked when jumping back here and restored + * after returning to the jumper. 
+ */ + n = setjmp(initial_jmpbuf); + switch (n) { + case INIT_JMP_NEW_THREAD: + (*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup; + (*switch_buf)[0].JB_SP = (unsigned long) stack + + UM_THREAD_SIZE - sizeof(void *); + break; + case INIT_JMP_CALLBACK: + (*cb_proc)(cb_arg); + longjmp(*cb_back, 1); + break; + case INIT_JMP_HALT: + kmalloc_ok = 0; + return 0; + case INIT_JMP_REBOOT: + kmalloc_ok = 0; + return 1; + default: + printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n", + __func__, n); + fatal_sigsegv(); + } + longjmp(*switch_buf, 1); + + /* unreachable */ + printk(UM_KERN_ERR "impossible long jump!"); + fatal_sigsegv(); + return 0; +} + +void initial_thread_cb_skas(void (*proc)(void *), void *arg) +{ + jmp_buf here; + + cb_proc = proc; + cb_arg = arg; + cb_back = &here; + + initial_jmpbuf_lock(); + if (UML_SETJMP(&here) == 0) + UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK); + initial_jmpbuf_unlock(); + + cb_proc = NULL; + cb_arg = NULL; + cb_back = NULL; +} + +void halt_skas(void) +{ + initial_jmpbuf_lock(); + UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT); + /* unreachable */ +} + +static bool noreboot; + +static int __init noreboot_cmd_param(char *str, int *add) +{ + *add = 0; + noreboot = true; + return 0; +} + +__uml_setup("noreboot", noreboot_cmd_param, +"noreboot\n" +" Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n" +" This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n" +" crashes in CI\n\n"); + +void reboot_skas(void) +{ + initial_jmpbuf_lock(); + UML_LONGJMP(&initial_jmpbuf, noreboot ? 
INIT_JMP_HALT : INIT_JMP_REBOOT); + /* unreachable */ +} diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 6c993bc8c78eb2..52a525b0e47817 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "internal.h" void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *, void *mc) = { @@ -32,6 +33,7 @@ void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *, void *mc) = [SIGSEGV] = segv_handler, [SIGIO] = sigio_handler, [SIGCHLD] = sigchld_handler, + [SIGSYS] = sigsys_handler, }; static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) @@ -39,9 +41,10 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) struct uml_pt_regs r; r.is_user = 0; + if (mc) + get_regs_from_mc(&r, mc); if (sig == SIGSEGV) { /* For segfaults, we want the data from the sigcontext. */ - get_regs_from_mc(&r, mc); GET_FAULTINFO_FROM_MC(r.faultinfo, mc); } @@ -180,6 +183,11 @@ static void sigusr1_handler(int sig, struct siginfo *unused_si, mcontext_t *mc) uml_pm_wake(); } +__weak void sigsys_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc) +{ +} + void register_pm_wake_signal(void) { set_handler(SIGUSR1); @@ -191,6 +199,7 @@ static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = { [SIGILL] = sig_handler, [SIGFPE] = sig_handler, [SIGTRAP] = sig_handler, + [SIGSYS] = sig_handler, [SIGIO] = sig_handler, [SIGWINCH] = sig_handler, diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index d6c22f8aa06d1c..0ebde3d488d668 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -29,16 +28,10 @@ #include #include #include -#include #include #include #include "../internal.h" -int is_skas_winch(int pid, int fd, void *data) -{ - return pid == getpgrp(); -} - 
static const char *ptrace_reg_name(int idx) { #define R(n) case HOST_##n: return #n @@ -426,8 +419,6 @@ static int __init init_stub_exe_fd(void) } __initcall(init_stub_exe_fd); -int using_seccomp; - /** * start_userspace() - prepare a new userspace process * @mm_id: The corresponding struct mm_id @@ -540,7 +531,7 @@ int start_userspace(struct mm_id *mm_id) return err; } -static int unscheduled_userspace_iterations; +extern int unscheduled_userspace_iterations; extern unsigned long tt_extra_sched_jiffies; void userspace(struct uml_pt_regs *regs) @@ -789,120 +780,3 @@ void userspace(struct uml_pt_regs *regs) } } } - -void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) -{ - (*buf)[0].JB_IP = (unsigned long) handler; - (*buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE - - sizeof(void *); -} - -#define INIT_JMP_NEW_THREAD 0 -#define INIT_JMP_CALLBACK 1 -#define INIT_JMP_HALT 2 -#define INIT_JMP_REBOOT 3 - -void switch_threads(jmp_buf *me, jmp_buf *you) -{ - unscheduled_userspace_iterations = 0; - - if (UML_SETJMP(me) == 0) - UML_LONGJMP(you, 1); -} - -static jmp_buf initial_jmpbuf; - -static __thread void (*cb_proc)(void *arg); -static __thread void *cb_arg; -static __thread jmp_buf *cb_back; - -int start_idle_thread(void *stack, jmp_buf *switch_buf) -{ - int n; - - set_handler(SIGWINCH); - - /* - * Can't use UML_SETJMP or UML_LONGJMP here because they save - * and restore signals, with the possible side-effect of - * trying to handle any signals which came when they were - * blocked, which can't be done on this stack. - * Signals must be blocked when jumping back here and restored - * after returning to the jumper. 
- */ - n = setjmp(initial_jmpbuf); - switch (n) { - case INIT_JMP_NEW_THREAD: - (*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup; - (*switch_buf)[0].JB_SP = (unsigned long) stack + - UM_THREAD_SIZE - sizeof(void *); - break; - case INIT_JMP_CALLBACK: - (*cb_proc)(cb_arg); - longjmp(*cb_back, 1); - break; - case INIT_JMP_HALT: - kmalloc_ok = 0; - return 0; - case INIT_JMP_REBOOT: - kmalloc_ok = 0; - return 1; - default: - printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n", - __func__, n); - fatal_sigsegv(); - } - longjmp(*switch_buf, 1); - - /* unreachable */ - printk(UM_KERN_ERR "impossible long jump!"); - fatal_sigsegv(); - return 0; -} - -void initial_thread_cb_skas(void (*proc)(void *), void *arg) -{ - jmp_buf here; - - cb_proc = proc; - cb_arg = arg; - cb_back = &here; - - initial_jmpbuf_lock(); - if (UML_SETJMP(&here) == 0) - UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK); - initial_jmpbuf_unlock(); - - cb_proc = NULL; - cb_arg = NULL; - cb_back = NULL; -} - -void halt_skas(void) -{ - initial_jmpbuf_lock(); - UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT); - /* unreachable */ -} - -static bool noreboot; - -static int __init noreboot_cmd_param(char *str, int *add) -{ - *add = 0; - noreboot = true; - return 0; -} - -__uml_setup("noreboot", noreboot_cmd_param, -"noreboot\n" -" Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n" -" This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n" -" crashes in CI\n\n"); - -void reboot_skas(void) -{ - initial_jmpbuf_lock(); - UML_LONGJMP(&initial_jmpbuf, noreboot ? 
INIT_JMP_HALT : INIT_JMP_REBOOT); - /* unreachable */ -} diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 054ac03bbf5eeb..c0afe5d8b5598d 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include #include @@ -37,6 +39,8 @@ #include #include "internal.h" +int host_has_fsgsbase; + static void ptrace_child(void) { int ret; @@ -239,7 +243,7 @@ extern unsigned long *exec_fp_regs; __initdata static struct stub_data *seccomp_test_stub_data; -static void __init sigsys_handler(int sig, siginfo_t *info, void *p) +static void __init _sigsys_handler(int sig, siginfo_t *info, void *p) { ucontext_t *uc = p; @@ -274,7 +278,7 @@ static int __init seccomp_helper(void *data) sizeof(seccomp_test_stub_data->sigstack)); sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; - sa.sa_sigaction = (void *) sigsys_handler; + sa.sa_sigaction = (void *) _sigsys_handler; sa.sa_restorer = NULL; if (sigaction(SIGSYS, &sa, NULL) < 0) exit(2); @@ -460,6 +464,20 @@ __uml_setup("seccomp=", uml_seccomp_config, " This is insecure and should only be used with a trusted userspace\n\n" ); +static void __init check_fsgsbase(void) +{ + unsigned long auxv = getauxval(AT_HWCAP2); + + os_info("Checking FSGSBASE instructions..."); + if (auxv & HWCAP2_FSGSBASE) { + host_has_fsgsbase = 1; + os_info("OK\n"); + } else { + host_has_fsgsbase = 0; + os_info("disabled\n"); + } +} + void __init os_early_checks(void) { int pid; @@ -488,6 +506,9 @@ void __init os_early_checks(void) using_seccomp = 0; check_ptrace(); + /* probe fsgsbase instruction */ + check_fsgsbase(); + pid = start_ptraced_child(); if (init_pid_registers(pid)) fatal("Failed to initialize default registers"); diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index e3ad71a0d13c41..5fb26f5dfcb629 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -64,7 +64,8 @@ void setup_machinename(char *machine_out) 
} # endif #endif - strcpy(machine_out, host.machine); + strcat(machine_out, "/"); + strcat(machine_out, host.machine); } void setup_hostinfo(char *buf, int len) diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 44b12e45f9a058..329f75b3df9407 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -13,7 +13,7 @@ config UML_X86 select ARCH_USE_QUEUED_SPINLOCKS select DCACHE_WORD_ACCESS select HAVE_EFFICIENT_UNALIGNED_ACCESS - select UML_SUBARCH_SUPPORTS_SMP if X86_CX8 + select UML_SUBARCH_SUPPORTS_SMP if X86_CX8 && MMU config 64BIT bool "64-bit kernel" if "$(SUBARCH)" = "x86" diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index f9ea75bf43aca3..8b4780968ec58d 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -15,6 +15,8 @@ obj-y = bugs_$(BITS).o delay.o fault.o \ sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ subarch.o os-Linux/ +obj-y += tests/ + ifeq ($(CONFIG_X86_32),y) obj-y += syscalls_32.o @@ -31,6 +33,10 @@ obj-y += mem_64.o syscalls_64.o vdso/ subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o \ ../lib/memmove_64.o ../lib/memset_64.o +ifneq ($(CONFIG_MMU),y) +obj-y += nommu/ +endif + endif subarch-$(CONFIG_MODULES) += ../kernel/module.o diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 22d0111b543b34..ca389d5213234b 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -9,6 +9,7 @@ #include #define CORE_DUMP_USE_REGSET +#define ELF_FDPIC_CORE_EFLAGS 0 #ifdef CONFIG_X86_32 @@ -155,11 +156,17 @@ struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); +struct elf_fdpic_params; +extern int elf_arch_finalize_exec(struct elf_fdpic_params *exec_params, + struct elf_fdpic_params *interp_params); extern unsigned long um_vdso_addr; #define AT_SYSINFO_EHDR 33 -#define ARCH_DLINFO NEW_AUX_ENT(AT_SYSINFO_EHDR, um_vdso_addr) - +#define ARCH_DLINFO \ +do { \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, um_vdso_addr); \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, 0); \ +} 
while (0) #endif typedef unsigned long elf_greg_t; diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h index d6208d0fad51a9..bb4f6f01166796 100644 --- a/arch/x86/um/asm/syscall.h +++ b/arch/x86/um/asm/syscall.h @@ -20,4 +20,10 @@ static inline int syscall_get_arch(struct task_struct *task) #endif } +#ifndef CONFIG_MMU +extern void do_syscall_64(struct pt_regs *regs); +extern long __kernel_vsyscall(int64_t a0, int64_t a1, int64_t a2, int64_t a3, + int64_t a4, int64_t a5, int64_t a6); +#endif + #endif /* __UM_ASM_SYSCALL_H */ diff --git a/arch/x86/um/nommu/Makefile b/arch/x86/um/nommu/Makefile new file mode 100644 index 00000000000000..98b719d354bb3b --- /dev/null +++ b/arch/x86/um/nommu/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0 +ifeq ($(CONFIG_X86_32),y) + BITS := 32 +else + BITS := 64 +endif + +obj-y = do_syscall_$(BITS).o entry_$(BITS).o syscalls_$(BITS).o os-Linux/ +obj-y += zpoline.o + +# used by zpoline.c to translate syscall/sysenter instructions +# note: only in x86_64 w/ !CONFIG_MMU +inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk +inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt +quiet_cmd_inat_tables = GEN $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ +$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) + $(call cmd,inat_tables) + +targets += inat-tables.c +$(obj)/../../lib/inat.o: $(obj)/inat-tables.c +obj-y += ../../lib/insn.o ../../lib/inat.o diff --git a/arch/x86/um/nommu/do_syscall_64.c b/arch/x86/um/nommu/do_syscall_64.c new file mode 100644 index 00000000000000..7756c30f183c65 --- /dev/null +++ b/arch/x86/um/nommu/do_syscall_64.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include "syscalls.h" + +__visible void do_syscall_64(struct pt_regs *regs) +{ + int syscall; + + syscall = PT_SYSCALL_NR(regs->regs.gp); + UPT_SYSCALL_NR(&regs->regs) = syscall; + + /* set fs 
register to the original host one */ + os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs); + + /* save fp registers */ + asm volatile("fxsaveq %0" : "=m"(*(struct _xstate *)regs->regs.fp)); + + if (likely(syscall >= 0 && syscall < NR_syscalls)) { + unsigned long ret; + + ret = (*sys_call_table[syscall])(UPT_SYSCALL_ARG1(&regs->regs), + UPT_SYSCALL_ARG2(&regs->regs), + UPT_SYSCALL_ARG3(&regs->regs), + UPT_SYSCALL_ARG4(&regs->regs), + UPT_SYSCALL_ARG5(&regs->regs), + UPT_SYSCALL_ARG6(&regs->regs)); + PT_REGS_SET_SYSCALL_RETURN(regs, ret); + } + + PT_REGS_SYSCALL_RET(regs) = regs->regs.gp[HOST_AX]; + + /* handle tasks and signals at the end */ + interrupt_end(); + + /* restore fp registers */ + asm volatile("fxrstorq %0" : : "m"((current->thread.regs.regs.fp))); + + /* restore back fs register to userspace configured one */ + os_x86_arch_prctl(0, ARCH_SET_FS, + (void *)(current->thread.regs.regs.gp[FS_BASE + / sizeof(unsigned long)])); + +} diff --git a/arch/x86/um/nommu/entry_64.S b/arch/x86/um/nommu/entry_64.S new file mode 100644 index 00000000000000..a58922fc81e5ef --- /dev/null +++ b/arch/x86/um/nommu/entry_64.S @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + +#include +#include +#include + +#include "../entry/calling.h" + +#ifdef CONFIG_SMP +#error need to stash these variables somewhere else +#endif + +#define UM_GLOBAL_VAR(x) .data; .align 8; .globl x; x:; .quad 0 + +UM_GLOBAL_VAR(current_top_of_stack) +UM_GLOBAL_VAR(current_ptregs) + +.code64 +.section .entry.text, "ax" + +.align 8 +#undef ENTRY +#define ENTRY(x) .text; .globl x; .type x,%function; x: +#undef END +#define END(x) .size x, . - x + +/* + * %rcx has the return address (we set it before entering __kernel_vsyscall). 
+ * + * Registers on entry: + * rax system call number + * rcx return address + * rdi arg0 + * rsi arg1 + * rdx arg2 + * r10 arg3 + * r8 arg4 + * r9 arg5 + * + * (note: we are allowed to mess with r11: r11 is callee-clobbered + * register in C ABI) + */ +ENTRY(__kernel_vsyscall) + + movq %rsp, %r11 + + /* Point rsp to the top of the ptregs array, so we can + just fill it with a bunch of push'es. */ + movq current_ptregs, %rsp + + /* 8 bytes * 20 registers (plus 8 for the push) */ + addq $168, %rsp + + /* Construct struct pt_regs on stack */ + pushq $0 /* pt_regs->ss (index 20) */ + pushq %r11 /* pt_regs->sp */ + pushfq /* pt_regs->flags */ + pushq $0 /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + + PUSH_AND_CLEAR_REGS rax=$-ENOSYS + + mov %rsp, %rdi + + /* + * Switch to current top of stack, so "current->" points + * to the right task. + */ + movq current_top_of_stack, %rsp + + call do_syscall_64 + + jmp userspace + +END(__kernel_vsyscall) + +/* + * common userspace returning routine + * + * all procedures like syscalls, signal handlers, umh processes, will gate + * this routine to properly configure registers/stacks. 
+ * + * void userspace(struct uml_pt_regs *regs) + */ +ENTRY(userspace) + + /* set stack and pt_regs to the current task */ + call arch_set_stack_to_current + /* clear direction flag to meet ABI */ + cld + /* align the stack for x86_64 ABI */ + and $-0x10, %rsp + /* Handle any immediate reschedules or signals */ + call interrupt_end + + movq current_ptregs, %rsp + + POP_REGS + + addq $8, %rsp /* skip orig_ax */ + popq %rcx /* pt_regs->ip */ + addq $8, %rsp /* skip cs */ + addq $8, %rsp /* skip flags */ + popq %rsp + + /* + * not return w/ ret but w/ jmp as the stack is already popped before + * entering __kernel_vsyscall + */ + jmp *%rcx + +END(userspace) diff --git a/arch/x86/um/nommu/os-Linux/Makefile b/arch/x86/um/nommu/os-Linux/Makefile new file mode 100644 index 00000000000000..4571e403a6ffa1 --- /dev/null +++ b/arch/x86/um/nommu/os-Linux/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y = mcontext.o +USER_OBJS := mcontext.o + +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/x86/um/nommu/os-Linux/mcontext.c b/arch/x86/um/nommu/os-Linux/mcontext.c new file mode 100644 index 00000000000000..1f24990ad8abdb --- /dev/null +++ b/arch/x86/um/nommu/os-Linux/mcontext.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#define __FRAME_OFFSETS +#include +#include +#include +#include +#include "../syscalls.h" + +extern long __kernel_vsyscall(int64_t a0, int64_t a1, int64_t a2, int64_t a3, + int64_t a4, int64_t a5, int64_t a6); + +void set_mc_relay_signal(mcontext_t *mc) +{ + /* configure stack and userspace returning routine as + * instruction pointer + */ + mc->gregs[REG_RSP] = (unsigned long) current_top_of_stack; + mc->gregs[REG_RIP] = (unsigned long) userspace; +} + +void set_mc_sigsys_hook(mcontext_t *mc) +{ + os_x86_set_hostfs(); + mc->gregs[REG_RCX] = mc->gregs[REG_RIP]; + mc->gregs[REG_RIP] = (unsigned long) __kernel_vsyscall; +} diff --git a/arch/x86/um/nommu/syscalls.h b/arch/x86/um/nommu/syscalls.h new file 
mode 100644 index 00000000000000..89c47ddfdada13 --- /dev/null +++ b/arch/x86/um/nommu/syscalls.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_NOMMU_SYSCALLS_H +#define __UM_NOMMU_SYSCALLS_H + + +#define task_top_of_stack(task) \ +({ \ + unsigned long __ptr = (unsigned long)task->stack; \ + __ptr += THREAD_SIZE; \ + __ptr; \ +}) + +extern long current_top_of_stack; +extern long current_ptregs; + +int os_x86_arch_prctl(int pid, int option, unsigned long *arg2); +void arch_set_stack_to_current(void); +void os_x86_set_hostfs(void); + +#endif diff --git a/arch/x86/um/nommu/syscalls_64.c b/arch/x86/um/nommu/syscalls_64.c new file mode 100644 index 00000000000000..9a8d45acb1cc77 --- /dev/null +++ b/arch/x86/um/nommu/syscalls_64.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include /* XXX This should get the constants from libc */ +#include +#include +#include "syscalls.h" + +/* + * The guest libc can change FS, which confuses the host libc. + * In fact, changing FS directly is not supported (check + * man arch_prctl). So, whenever we make a host syscall, + * we should be changing FS to the original FS (not the + * one set by the guest libc). This original FS is stored + * in host_fs. 
+ */ +long long host_fs = -1; + +int os_x86_arch_prctl(int pid, int option, unsigned long *arg2) +{ + if (!host_has_fsgsbase) + return os_arch_prctl(pid, option, arg2); + + switch (option) { + case ARCH_SET_FS: + wrfsbase(*arg2); + break; + case ARCH_SET_GS: + wrgsbase(*arg2); + break; + case ARCH_GET_FS: + *arg2 = rdfsbase(); + break; + case ARCH_GET_GS: + *arg2 = rdgsbase(); + break; + default: + pr_warn("%s: unsupported option: 0x%x", __func__, option); + break; + } + + return 0; +} + +void arch_set_stack_to_current(void) +{ + current_top_of_stack = task_top_of_stack(current); + current_ptregs = (long)task_pt_regs(current); +} + +void arch_switch_to(struct task_struct *to) +{ + /* + * In !CONFIG_MMU, it doesn't ptrace thus, + * The FS_BASE registers are saved here. + */ + current_top_of_stack = task_top_of_stack(to); + current_ptregs = (long)task_pt_regs(to); + + if ((to->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] == 0) || + (to->mm == NULL)) + return; + + /* this changes the FS on every context switch */ + os_x86_arch_prctl(0, ARCH_SET_FS, + (void __user *) to->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)]); +} + +static int __init um_nommu_setup_hostfs(void) +{ + /* initialize the host_fs value at boottime */ + os_x86_arch_prctl(0, ARCH_GET_FS, (void *)&host_fs); + + return 0; +} +arch_initcall(um_nommu_setup_hostfs); + +void os_x86_set_hostfs(void) +{ + if (host_fs == -1) + um_nommu_setup_hostfs(); + + os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs); +} diff --git a/arch/x86/um/nommu/zpoline.c b/arch/x86/um/nommu/zpoline.c new file mode 100644 index 00000000000000..761a96b7c5c899 --- /dev/null +++ b/arch/x86/um/nommu/zpoline.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * zpoline.c + * + * Replace syscall/sysenter instructions to `call *%rax` to hook syscalls. 
+ * + */ +//#define DEBUG +#include +#include +#include +#include +#include +#include +#include + +int um_zpoline_enabled; +/* start of trampoline code area */ +static char *__zpoline_start; + +static int __zpoline_translate_syscalls(struct elf_fdpic_params *params) +{ + int count = 0, loop; + struct insn insn; + unsigned long addr; + struct elf_fdpic_loadseg *seg; + struct elf_phdr *phdr; + struct elfhdr *ehdr = (struct elfhdr *)params->elfhdr_addr; + + if (!ehdr) + return 0; + + seg = params->loadmap->segs; + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (phdr->p_type != PT_LOAD) + continue; + addr = seg->addr; + /* skip translation of trampoline code */ + if (addr <= (unsigned long)(&__zpoline_start[0] + 0x1000 + 0x0100)) { + pr_warn("%lx: address is in the range of trampoline", addr); + return -EINVAL; + } + + /* translate only segment with Executable flag */ + if (!(phdr->p_flags & PF_X)) { + seg++; + continue; + } + + pr_debug("translation 0x%lx-0x%llx", addr, + seg->addr + seg->p_memsz); + /* now ready to translate */ + while (addr < (seg->addr + seg->p_memsz)) { + insn_init(&insn, (void *)addr, MAX_INSN_SIZE, 1); + insn_get_length(&insn); + + insn_get_opcode(&insn); + + switch (insn.opcode.bytes[0]) { + case 0xf: + switch (insn.opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + pr_debug("%lx: found syscall/sysenter", addr); + *(char *)addr = 0xff; // callq + *((char *)addr + 1) = 0xd0; // *%rax + count++; + break; + } + default: + break; + } + + addr += insn.length; + if (insn.length == 0) { + pr_debug("%lx: length zero with byte %x. skip ?", + addr, insn.opcode.bytes[0]); + addr += 1; + } + } + seg++; + } + return count; +} + +/** + * elf_arch_finalize_exec() - architecture hook to translate syscall/sysenter + * + * translate syscall/sysenter instruction upon loading ELF binary file + * on execve(2)&co syscall. 
+ * + * suppose we have those instructions: + * + * mov $sysnr, %rax + * syscall 0f 05 + * + * this will translate it with: + * + * mov $sysnr, %rax (<= untouched) + * call *(%rax) ff d0 + * + * this will finally called hook function guided by trampoline code installed + * at setup_zpoline_trampoline(). + * + * @exec_params: ELF meta data for executable file + * @interp_params: ELF meta data for the interpreter file + */ +int elf_arch_finalize_exec(struct elf_fdpic_params *exec_params, + struct elf_fdpic_params *interp_params) +{ + int err = 0, count = 0; + struct mm_struct *mm = current->mm; + + /* zpoline disabled */ + if (!um_zpoline_enabled) + return 0; + + if (down_write_killable(&mm->mmap_lock)) + return -EINTR; + + /* translate for the executable */ + err = __zpoline_translate_syscalls(exec_params); + if (err < 0) { + pr_info("zpoline: xlate error %d", err); + goto out; + } + count += err; + pr_debug("zpoline: rewritten (exec) %d syscalls\n", count); + + /* translate for the interpreter */ + err = __zpoline_translate_syscalls(interp_params); + if (err < 0) { + pr_info("zpoline: xlate error %d", err); + goto out; + } + count += err; + + err = 0; + pr_debug("zpoline: rewritten (exec+interp) %d syscalls\n", count); + +out: + up_write(&mm->mmap_lock); + return err; +} + +/** + * setup_zpoline_trampoline() - install trampoline code for zpoline + * + * setup trampoline code for syscall hooks + * + * the trampoline code guides to call hooked function, __kernel_vsyscall + * in this case, via nop slides at the memory address zero (thus, zpoline). + * + * loaded binary by exec(2) is translated to call the function. 
+ */ +static int __init setup_zpoline_trampoline(void) +{ + int i, ret; + int ptr; + + if (!um_zpoline_enabled) + return 0; + + /* zpoline: map area of trampoline code started from addr 0x0 */ + __zpoline_start = 0x0; + + ret = os_map_memory((void *) 0, -1, 0, PAGE_SIZE, 1, 1, 1); + if (ret) + panic("map failed\n NOTE: /proc/sys/vm/mmap_min_addr should be set 0\n"); + + /* fill nop instructions until the trampoline code */ + for (i = 0; i < NR_syscalls; i++) + __zpoline_start[i] = 0x90; + + /* optimization to skip old syscalls */ + /* short jmp */ + __zpoline_start[214 /* __NR_epoll_ctl_old */] = 0xeb; + /* range of a short jmp : -128 ~ +127 */ + __zpoline_start[215 /* __NR_epoll_wait_old */] = 127; + + /** + * FIXME: shift red zone area to properly handle the case + */ + + /** + * put code for jumping to __kernel_vsyscall. + * + * here we embed the following code. + * + * movabs [$addr],%r11 + * jmpq *%r11 + * + */ + ptr = NR_syscalls; + /* 49 bb [64-bit addr (8-byte)] movabs [64-bit addr (8-byte)],%r11 */ + __zpoline_start[ptr++] = 0x49; + __zpoline_start[ptr++] = 0xbb; + __zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 0)); + __zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 1)); + __zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 2)); + __zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 3)); + __zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 4)); + __zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 5)); + __zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 6)); + __zpoline_start[ptr++] = ((uint64_t) __kernel_vsyscall >> (8 * 7)); + + /* + * pretending to be syscall instruction by putting return + * address in %rcx. 
+ */ + /* 59 pop %rcx */ + __zpoline_start[ptr++] = 0x59; + + /* 41 ff e3 jmp *%r11 */ + __zpoline_start[ptr++] = 0x41; + __zpoline_start[ptr++] = 0xff; + __zpoline_start[ptr++] = 0xe3; + + /* permission: XOM (PROT_EXEC only) */ + ret = os_protect_memory(0, PAGE_SIZE, 0, 0, 1); + if (ret) + panic("failed: can't configure permission on trampoline code"); + + pr_info("zpoline: setting up trampoline code done\n"); + return 0; +} +arch_initcall(setup_zpoline_trampoline); + +static int __init zpoline_set(char *str) +{ + int val = 0; + + get_option(&str, &val); + um_zpoline_enabled = val; + return 1; +} +__setup("zpoline=", zpoline_set); diff --git a/arch/x86/um/shared/sysdep/mcontext.h b/arch/x86/um/shared/sysdep/mcontext.h index 6fe490cc5b98a4..82a5f38b350f50 100644 --- a/arch/x86/um/shared/sysdep/mcontext.h +++ b/arch/x86/um/shared/sysdep/mcontext.h @@ -17,6 +17,11 @@ extern int get_stub_state(struct uml_pt_regs *regs, struct stub_data *data, extern int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data, int single_stepping); +#ifndef CONFIG_MMU +extern void set_mc_sigsys_hook(mcontext_t *mc); +extern void set_mc_relay_signal(mcontext_t *mc); +#endif + #ifdef __i386__ #define GET_FAULTINFO_FROM_MC(fi, mc) \ diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 572ea2d79131c7..6ed6bb1ca50eaa 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -53,7 +53,7 @@ struct uml_pt_regs { int is_user; /* Dynamically sized FP registers (holds an XSTATE) */ - unsigned long fp[]; + unsigned long fp[] __attribute__((aligned(16))); }; #define EMPTY_UML_PT_REGS { } diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index 6a00a28c9cca77..9684850603a286 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -45,7 +45,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) return arch_prctl(current, option, (unsigned long __user *) arg2); } -void 
arch_switch_to(struct task_struct *to) +__weak void arch_switch_to(struct task_struct *to) { /* * Nothing needs to be done on x86_64. diff --git a/arch/x86/um/tests/Makefile b/arch/x86/um/tests/Makefile new file mode 100644 index 00000000000000..743a0ff129df55 --- /dev/null +++ b/arch/x86/um/tests/Makefile @@ -0,0 +1,12 @@ +include $(srctree)/init/Makefile.nolibc + +ccflags-y := -I$(obj) + +um-tests-y += registers.o + +userprogs += test-fp-save-restore +test-fp-save-restore-userccflags := $(NOLIBC_USERCFLAGS) -msse + +obj-$(CONFIG_KUNIT_UAPI) += um-tests.o + +$(obj)/registers.o: $(obj)/test-fp-save-restore diff --git a/arch/x86/um/tests/registers.c b/arch/x86/um/tests/registers.c new file mode 100644 index 00000000000000..2c4e55da043c0f --- /dev/null +++ b/arch/x86/um/tests/registers.c @@ -0,0 +1,22 @@ +#include +#include +#include + +static void test_mcontext(struct kunit *test) +{ + KUNIT_UAPI_EMBED_BLOB(test_fp_save_restore, "test-fp-save-restore"); + + kunit_uapi_run_kselftest(test, &test_fp_save_restore); +} + +static struct kunit_case register_test_cases[] = { + KUNIT_CASE(test_mcontext), + {} +}; + +static struct kunit_suite register_test_suite = { + .name = "um_registers", + .test_cases = register_test_cases, +}; + +kunit_test_suites(&register_test_suite); diff --git a/arch/x86/um/tests/test-fp-save-restore.c b/arch/x86/um/tests/test-fp-save-restore.c new file mode 100644 index 00000000000000..28a32ca374fe64 --- /dev/null +++ b/arch/x86/um/tests/test-fp-save-restore.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test FP register handling in userspace mcontext. 
+ * + * Copyright (C) 2025 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../tools/testing/selftests/kselftest.h" + +#define ST0_EXP_ADD 10 + +static void sighandler(int sig, siginfo_t *info, void *p) +{ + struct ucontext *uc = p; + struct _fpstate *fpstate = (void *)uc->uc_mcontext.fpstate; + + ksft_print_msg("sighandler: extended_size: %d, xstate_size: %d\n", + fpstate->sw_reserved.extended_size, + fpstate->sw_reserved.xstate_size); + +#ifdef __i386__ + fpstate->_st[0].exponent += ST0_EXP_ADD; + fpstate->_xmm[1].element[0] |= 0x01010101; + fpstate->_xmm[1].element[1] |= 0x01010101; + fpstate->_xmm[1].element[2] |= 0x01010101; + fpstate->_xmm[1].element[3] |= 0x01010101; +#else + /* Hacky way of modifying the exponent without breaking aliasing */ + fpstate->st_space[2] += ST0_EXP_ADD; + fpstate->xmm_space[4] |= 0x01010101; + fpstate->xmm_space[5] |= 0x01010101; + fpstate->xmm_space[6] |= 0x01010101; + fpstate->xmm_space[7] |= 0x01010101; +#endif +} + +static int test_mcontext(int xmm_should_change) +{ + double num = 0.5; + uint32_t sse[4] = {0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00 }; + long ret; + int xmm_manipulated; + + ksft_print_msg("pre-signal: %d / 100, %08x %08x %08x %08x\n", (int) (100*num), sse[0], sse[1], sse[2], sse[3]); + /* + * This does kill(getpid(), SIGUSR1); with "num" being passed in AND + * out of the floating point stack. We can therefore modify num by + * changing st[0] when handling the signal. 
+ */ +#ifdef __i386__ + asm volatile ( + "movups %1, %%xmm1;" + "int $0x80;" + "movups %%xmm1, %1;" + : "=t" (num), "=m" (sse), "=a" (ret) + : "0" (num), "2" (__NR_kill), "b" (getpid()), "c" (SIGUSR1) : + "xmm1", "memory"); +#else + asm volatile ( + "movups %1, %%xmm1;" + "syscall;" + "movups %%xmm1, %1;" + : "=t" (num), "=m"(sse), "=a" (ret) + : "0" (num), "2" (__NR_kill), "D" (getpid()), "S" (SIGUSR1) + : "r11", "rcx", "xmm1", "memory"); +#endif + if (sse[0] == 0x11223344 || sse[1] == 0x55667788 || sse[2] == 0x99aabbcc || sse[3] == 0xddeeff00) + xmm_manipulated = 0; + else if (sse[0] == 0x11233345 || sse[1] == 0x55677789 || sse[2] == 0x99abbbcd || sse[3] == 0xddefff01) + xmm_manipulated = 1; + else + xmm_manipulated = 2; + + ksft_print_msg("post-signal: %d / 100, %08x %08x %08x %08x (should change: %d, changed: %d)\n", + (int) (100 * num), sse[0], sse[1], sse[2], sse[3], xmm_should_change, xmm_manipulated); + + if (num != (1 << (ST0_EXP_ADD - 1))) { + ksft_print_msg("floating point register was not manipulated\n"); + return 1; + } + + if (xmm_manipulated != xmm_should_change) { + ksft_print_msg("xmm/sse had unexpected value!\n"); + return 1; + } + + return 0; +} + +int main(void) +{ + struct sigaction sa = { + .sa_flags = SA_SIGINFO, + .sa_handler = (void (*)(int))sighandler, + }; + + ksft_print_header(); + ksft_set_plan(1); + + if (sys_sigaction(SIGUSR1, &sa, NULL) < 0) + ksft_exit_fail_msg("Failed to register sigaction: %d\n", errno); + + if (!test_mcontext(1)) + ksft_test_result_pass("mcontext\n"); + else + ksft_test_result_fail("mcontext failed!\n"); + + ksft_finished(); +} diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c index ca1468865b1400..61f529e5f26279 100644 --- a/arch/x86/um/vdso/um_vdso.c +++ b/arch/x86/um/vdso/um_vdso.c @@ -13,15 +13,31 @@ #include #include +/* XXX: FIXME, always trap SIGSYS on nommu, cannot use zpoline path as + * we don't know how to retrieve um_zpoline_enabled from vdso object ??? 
+ */ +#define __VDSO_SYSCALL1(sysnr, ret, a0) { \ + do { \ + asm("syscall" \ + : "=a" (ret) \ + : "0" (sysnr), "D" (a0) \ + : "rcx", "r11", "memory"); \ + } while (0); \ + } +#define __VDSO_SYSCALL2(sysnr, ret, a0, a1) { \ + do { \ + asm("syscall" \ + : "=a" (ret) \ + : "0" (sysnr), "D" (a0), "S" (a1) \ + : "rcx", "r11", "memory"); \ + } while (0); \ + } + int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) { long ret; - asm("syscall" - : "=a" (ret) - : "0" (__NR_clock_gettime), "D" (clock), "S" (ts) - : "rcx", "r11", "memory"); - + __VDSO_SYSCALL2(__NR_clock_gettime, ret, clock, ts); return ret; } int clock_gettime(clockid_t, struct __kernel_timespec *) @@ -31,11 +47,7 @@ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { long ret; - asm("syscall" - : "=a" (ret) - : "0" (__NR_gettimeofday), "D" (tv), "S" (tz) - : "rcx", "r11", "memory"); - + __VDSO_SYSCALL2(__NR_gettimeofday, ret, tv, tz); return ret; } int gettimeofday(struct __kernel_old_timeval *, struct timezone *) @@ -45,10 +57,7 @@ __kernel_old_time_t __vdso_time(__kernel_old_time_t *t) { long secs; - asm volatile("syscall" - : "=a" (secs) - : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory"); - + __VDSO_SYSCALL1(__NR_time, secs, t); return secs; } __kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 51a2b9f2eca95d..0799b3fe75211c 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -9,6 +9,7 @@ #include #include #include +#include unsigned long um_vdso_addr; static struct page *um_vdso; @@ -20,18 +21,29 @@ static int __init init_vdso(void) { BUG_ON(vdso_end - vdso_start > PAGE_SIZE); - um_vdso_addr = task_size - PAGE_SIZE; - um_vdso = alloc_page(GFP_KERNEL); if (!um_vdso) panic("Cannot allocate vdso\n"); copy_page(page_address(um_vdso), vdso_start); +#ifdef CONFIG_MMU + um_vdso_addr = task_size - PAGE_SIZE; +#else + /* this is fine 
with NOMMU as everything is accessible */ + um_vdso_addr = (unsigned long)page_address(um_vdso); + os_protect_memory((void *)um_vdso_addr, vdso_end - vdso_start, 1, 0, 1); +#endif + + pr_info("vdso_start=%lx um_vdso_addr=%lx pg_um_vdso=%lx", + (unsigned long)vdso_start, um_vdso_addr, + (unsigned long)page_address(um_vdso)); + return 0; } subsys_initcall(init_vdso); +#ifdef CONFIG_MMU int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct vm_area_struct *vma; @@ -53,3 +65,4 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return IS_ERR(vma) ? PTR_ERR(vma) : 0; } +#endif diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 1949e25c7741b1..0a92bebd5f759d 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on ARM || ((M68K || RISCV || SUPERH || XTENSA) && !MMU) + depends on ARM || ((M68K || RISCV || SUPERH || UML || XTENSA) && !MMU) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 48fd2de3bca052..17bd590f2fcdec 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -175,6 +175,12 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, return 0; } +int __weak elf_arch_finalize_exec(struct elf_fdpic_params *exec_params, + struct elf_fdpic_params *interp_params) +{ + return 0; +} + /*****************************************************************************/ /* * load an fdpic binary into various bits of memory @@ -458,6 +464,10 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) dynaddr); #endif + retval = elf_arch_finalize_exec(&exec_params, &interp_params); + if (retval) + goto error; + finalize_exec(bprm); /* everything is now ready... 
get the userspace context ready to roll */ entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; diff --git a/fs/exec.c b/fs/exec.c index 2e3a6593c6fd54..75c8c07e65fbf9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -1891,6 +1892,7 @@ int kernel_execve(const char *kernel_filename, return bprm_execve(bprm); } +EXPORT_SYMBOL_FOR_MODULES(kernel_execve, "kunit-uapi"); void set_binfmt(struct linux_binfmt *new) { diff --git a/fs/file.c b/fs/file.c index 0a4f3bdb2dec62..0add929178595f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1362,6 +1362,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) spin_unlock(&files->file_lock); return err; } +EXPORT_SYMBOL_FOR_MODULES(replace_fd, "kunit-uapi"); /** * receive_fd() - Install received file into file descriptor table diff --git a/fs/filesystems.c b/fs/filesystems.c index 0c7d2b7ac26c84..a86ac675040477 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * Handling of filesystem drivers list. 
@@ -45,6 +46,7 @@ void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); } +EXPORT_SYMBOL_FOR_MODULES(put_filesystem, "kunit-uapi"); static struct file_system_type **find_filesystem(const char *name, unsigned len) { diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 394875d06fd606..013b11eca7b480 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -43,6 +43,7 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path) if (old_pwd.dentry) path_put(&old_pwd); } +EXPORT_SYMBOL_FOR_MODULES(set_fs_pwd, "kunit-uapi"); static inline int replace_path(struct path *p, const struct path *old, const struct path *new) { diff --git a/fs/pipe.c b/fs/pipe.c index 9e6a0147581598..eb176ecbe8c057 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -978,6 +979,7 @@ int create_pipe_files(struct file **res, int flags) file_set_fsnotify_mode(res[1], FMODE_NONOTIFY_PERM); return 0; } +EXPORT_SYMBOL_FOR_MODULES(create_pipe_files, "kunit-uapi"); static int __do_pipe_flags(int *fd, struct file **files, int flags) { diff --git a/include/kunit/uapi.h b/include/kunit/uapi.h new file mode 100644 index 00000000000000..a5c923f5d82a91 --- /dev/null +++ b/include/kunit/uapi.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KUnit Userspace testing API. + * + * Copyright (C) 2025, Linutronix GmbH. + * Author: Thomas Weißschuh + */ + +#ifndef _KUNIT_UAPI_H +#define _KUNIT_UAPI_H + +#include + +struct kunit; + +/** + * struct kunit_uapi_blob - Blob embedded build artifact + * @path: Path of the embedded artifact. + * @data: Start of the embedded data in memory. + * @end: End of the embedded data in memory. + */ +struct kunit_uapi_blob { + const char *const path; + const u8 *data; + const u8 *end; +}; + +#if IS_ENABLED(CONFIG_KUNIT_UAPI) + +/** + * KUNIT_UAPI_EMBED_BLOB() - Embed another build artifact into the kernel + * @_name: The name of symbol under which the artifact is embedded. 
+ * @_path: Path to the artifact on disk. + * + * Embeds a build artifact like a userspace executable into the kernel or current module. + * The build artifact is read from disk and needs to be already built. + */ +#define KUNIT_UAPI_EMBED_BLOB(_name, _path) \ + asm ( \ + " .pushsection .rodata, \"a\" \n" \ + " .global " __stringify(CONCATENATE(_name, _data)) " \n" \ + __stringify(CONCATENATE(_name, _data)) ": \n" \ + " .incbin " __stringify(_path) " \n" \ + " .size " __stringify(CONCATENATE(_name, _data)) ", " \ + ". - " __stringify(CONCATENATE(_name, _data)) " \n" \ + " .global " __stringify(CONCATENATE(_name, _end)) " \n" \ + __stringify(CONCATENATE(_name, _end)) ": \n" \ + " .popsection \n" \ + ); \ + \ + extern const char CONCATENATE(_name, _data)[]; \ + extern const char CONCATENATE(_name, _end)[]; \ + \ + static const struct kunit_uapi_blob _name = { \ + .path = _path, \ + .data = CONCATENATE(_name, _data), \ + .end = CONCATENATE(_name, _end), \ + } \ + +#else /* !CONFIG_KUNIT_UAPI */ + +/* Unresolved external reference, to be optimized away */ +#define KUNIT_UAPI_EMBED_BLOB(_name, _path) \ + extern const struct kunit_uapi_blob _name + +#endif /* CONFIG_KUNIT_UAPI */ + +/** + * kunit_uapi_run_kselftest() - Run a userspace kselftest as part of kunit + * @test: The test context object. + * @executable: kselftest executable to run + * + * Runs the kselftest and forwards its TAP output and exit status to kunit. 
+ */ +void kunit_uapi_run_kselftest(struct kunit *test, const struct kunit_uapi_blob *executable); + +#endif /* _KUNIT_UAPI_H */ diff --git a/include/linux/elf-fdpic.h b/include/linux/elf-fdpic.h index e533f45131945f..e7fd85a1d10f6f 100644 --- a/include/linux/elf-fdpic.h +++ b/include/linux/elf-fdpic.h @@ -56,4 +56,7 @@ extern void elf_fdpic_arch_lay_out_mm(struct elf_fdpic_params *exec_params, unsigned long *start_brk); #endif +extern int elf_arch_finalize_exec(struct elf_fdpic_params *exec_params, + struct elf_fdpic_params *interp_params); + #endif /* _LINUX_ELF_FDPIC_H */ diff --git a/init/Kconfig b/init/Kconfig index e95d4345785186..d1bc9d9f6acea9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -91,6 +91,13 @@ config CC_CAN_LINK default $(cc_can_link_user,$(m64-flag)) if 64BIT default $(cc_can_link_user,$(m32-flag)) +config CC_CAN_LINK_STATIC + bool + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag) -static) if 64BIT + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m32-flag) -static) + +source "init/Kconfig.nolibc" + # Fixed in GCC 14, 13.3, 12.4 and 11.5 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 config GCC_ASM_GOTO_OUTPUT_BROKEN diff --git a/init/Kconfig.nolibc b/init/Kconfig.nolibc new file mode 100644 index 00000000000000..29cbc5437e70cb --- /dev/null +++ b/init/Kconfig.nolibc @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 + +config ARCH_HAS_NOLIBC + bool + default y if ARM + default y if ARM64 + default y if LOONGARCH + default y if M68K + default y if MIPS + default y if PPC + default y if RISCV + default y if S390 + default y if SPARC + default y if UML_X86 + default y if X86 diff --git a/init/Makefile.nolibc b/init/Makefile.nolibc new file mode 100644 index 00000000000000..69450f89bd620f --- /dev/null +++ b/init/Makefile.nolibc @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 +# Compiler flags, which are 
necessary to build userspace applications with the +# in-kernel libc "nolibc". + +ifeq ($(CONFIG_MMU),y) + NOLIBC_STATIC_FLAG := -static +else + NOLIBC_STATIC_FLAG := -static-pie +endif + +ifeq ($(and $(CONFIG_ARCH_HAS_NOLIBC),$(CONFIG_HEADERS_INSTALL)),y) + +NOLIBC_USERCFLAGS := -nostdlib -nostdinc $(NOLIBC_STATIC_FLAG) -ffreestanding \ + -fno-asynchronous-unwind-tables -fno-stack-protector \ + -isystem $(objtree)/usr/include -isystem $(srctree)/tools/include/nolibc/ + +NOLIBC_USERLDFLAGS := -nostdlib -nostdinc $(NOLIBC_STATIC_FLAG) + +endif # CONFIG_ARCH_HAS_NOLIBC && CONFIG_HEADERS_INSTALL diff --git a/kernel/exit.c b/kernel/exit.c index 8a87021211ae72..b44677d6fa73c6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -71,6 +71,7 @@ #include #include #include +#include #include @@ -1013,6 +1014,7 @@ void __noreturn do_exit(long code) lockdep_free_task(tsk); do_task_dead(); } +EXPORT_SYMBOL_FOR_MODULES(do_exit, "kunit-uapi"); void __noreturn make_task_dead(int signr) { @@ -1895,6 +1897,7 @@ int kernel_wait(pid_t pid, int *stat) put_pid(wo.wo_pid); return ret; } +EXPORT_SYMBOL_FOR_MODULES(kernel_wait, "kunit-uapi"); SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) diff --git a/kernel/fork.c b/kernel/fork.c index e832da9d15a434..8edbc4f3f5a401 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -107,6 +107,7 @@ #include #include #include +#include #include #include @@ -2729,6 +2730,7 @@ pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) return kernel_clone(&args); } +EXPORT_SYMBOL_FOR_MODULES(user_mode_thread, "kunit-uapi"); #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig index 498cc51e493dc9..24db2c00238225 100644 --- a/lib/kunit/Kconfig +++ b/lib/kunit/Kconfig @@ -141,4 +141,18 @@ config KUNIT_UML_PCI If unsure, say N. 
+config KUNIT_UAPI + tristate "KUnit UAPI testing framework" + depends on KUNIT + depends on CC_CAN_LINK_STATIC || ARCH_HAS_NOLIBC + depends on !LTO_CLANG # https://github.com/llvm/llvm-project/issues/112920 + select HEADERS_INSTALL + default KUNIT + help + Enables support for building and running userspace selftests as part of kunit. + These tests should be statically linked and use kselftest.h or kselftest_harness.h + for status reporting. + + In most cases this should be left as Y. + endif # KUNIT diff --git a/lib/kunit/Makefile b/lib/kunit/Makefile index 656f1fa35abcc6..ac7b476c437a5f 100644 --- a/lib/kunit/Makefile +++ b/lib/kunit/Makefile @@ -1,3 +1,5 @@ +include $(srctree)/init/Makefile.nolibc + obj-$(CONFIG_KUNIT) += kunit.o kunit-objs += test.o \ @@ -12,6 +14,14 @@ kunit-objs += test.o \ device.o \ platform.o +userprogs += uapi-preinit +uapi-preinit-userccflags += $(NOLIBC_USERCFLAGS) \ + -include include/generated/autoconf.h \ + -include $(srctree)/tools/include/linux/kconfig.h +obj-$(CONFIG_KUNIT_UAPI) += kunit-uapi.o + +$(obj)/kunit-uapi.o: $(obj)/uapi-preinit + ifeq ($(CONFIG_KUNIT_DEBUGFS),y) kunit-objs += debugfs.o endif @@ -20,6 +30,14 @@ endif obj-$(if $(CONFIG_KUNIT),y) += hooks.o obj-$(CONFIG_KUNIT_TEST) += kunit-test.o + +userprogs += kunit-test-uapi +kunit-test-uapi-userccflags := $(NOLIBC_USERCFLAGS) + +ifdef CONFIG_KUNIT_UAPI +$(obj)/kunit-test.o: $(obj)/kunit-test-uapi +endif + obj-$(CONFIG_KUNIT_TEST) += platform-test.o # string-stream-test compiles built-in only. 
@@ -29,3 +47,10 @@ obj-$(CONFIG_KUNIT_TEST) += assert_test.o endif obj-$(CONFIG_KUNIT_EXAMPLE_TEST) += kunit-example-test.o + +userprogs += kunit-example-uapi +kunit-example-uapi-userccflags := $(NOLIBC_USERCFLAGS) + +ifdef CONFIG_KUNIT_UAPI +$(obj)/kunit-example-test.o: $(obj)/kunit-example-uapi +endif diff --git a/lib/kunit/kunit-example-test.c b/lib/kunit/kunit-example-test.c index 9452b163956f3e..3c0146fd9059ff 100644 --- a/lib/kunit/kunit-example-test.c +++ b/lib/kunit/kunit-example-test.c @@ -8,6 +8,7 @@ #include #include +#include /* * This is the most fundamental element of KUnit, the test case. A test case @@ -489,6 +490,19 @@ static void example_params_test_with_init_dynamic_arr(struct kunit *test) KUNIT_EXPECT_EQ(test, param_val - param_val, 0); } +/* + * This test shows the usage of UAPI tests. + */ +static void example_uapi_test(struct kunit *test) +{ + KUNIT_UAPI_EMBED_BLOB(kunit_example_uapi, "kunit-example-uapi"); + + if (IS_ENABLED(CONFIG_KUNIT_UAPI)) + kunit_uapi_run_kselftest(test, &kunit_example_uapi); + else + kunit_skip(test, "CONFIG_KUNIT_UAPI is not enabled"); +} + /* * Here we make a list of all the test cases we want to add to the test suite * below. @@ -514,6 +528,7 @@ static struct kunit_case example_test_cases[] = { kunit_array_gen_params, example_param_init_dynamic_arr, example_param_exit_dynamic_arr), KUNIT_CASE_SLOW(example_slow_test), + KUNIT_CASE(example_uapi_test), {} }; diff --git a/lib/kunit/kunit-example-uapi.c b/lib/kunit/kunit-example-uapi.c new file mode 100644 index 00000000000000..4ce657050dd4a5 --- /dev/null +++ b/lib/kunit/kunit-example-uapi.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit Userspace example test. + * + * Copyright (C) 2025, Linutronix GmbH. + * Author: Thomas Weißschuh + * + * This is *userspace* code. 
+ */ + +#include "../../tools/testing/selftests/kselftest.h" + +int main(void) +{ + ksft_print_header(); + ksft_set_plan(4); + ksft_test_result_pass("userspace test 1\n"); + ksft_test_result_pass("userspace test 2\n"); + ksft_test_result_skip("userspace test 3: some reason\n"); + ksft_test_result_pass("userspace test 4\n"); + ksft_finished(); +} diff --git a/lib/kunit/kunit-test-uapi.c b/lib/kunit/kunit-test-uapi.c new file mode 100644 index 00000000000000..ec5395d809ee2a --- /dev/null +++ b/lib/kunit/kunit-test-uapi.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit Userspace selftest. + * + * Copyright (C) 2025, Linutronix GmbH. + * Author: Thomas Weißschuh + * + * This is *userspace* code. + */ + +#include +#include +#include + +#include "../../tools/testing/selftests/kselftest.h" + +static void test_procfs(void) +{ + char buf[256]; + ssize_t r; + int fd; + + fd = open("/proc/self/comm", O_RDONLY); + if (fd == -1) { + ksft_test_result_fail("procfs: open() failed: %s\n", strerror(errno)); + return; + } + + r = read(fd, buf, sizeof(buf)); + close(fd); + + if (r == -1) { + ksft_test_result_fail("procfs: read() failed: %s\n", strerror(errno)); + return; + } + + if (r != 16 || strncmp("kunit-test-uapi\n", buf, 16) != 0) { + ksft_test_result_fail("procfs: incorrect comm\n"); + return; + } + + ksft_test_result_pass("procfs\n"); +} + +int main(void) +{ + ksft_print_header(); + ksft_set_plan(1); + test_procfs(); + ksft_finished(); +} diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c index 63130a48e23712..5ffc1a33349737 100644 --- a/lib/kunit/kunit-test.c +++ b/lib/kunit/kunit-test.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -915,10 +916,30 @@ static struct kunit_suite kunit_stub_test_suite = { .test_cases = kunit_stub_test_cases, }; +static void kunit_uapi_test(struct kunit *test) +{ + KUNIT_UAPI_EMBED_BLOB(kunit_test_uapi, "kunit-test-uapi"); + + if (IS_ENABLED(CONFIG_KUNIT_UAPI)) + 
kunit_uapi_run_kselftest(test, &kunit_test_uapi); + else + kunit_skip(test, "CONFIG_KUNIT_UAPI is not enabled"); +} + +static struct kunit_case kunit_uapi_test_cases[] = { + KUNIT_CASE(kunit_uapi_test), + {} +}; + +static struct kunit_suite kunit_uapi_test_suite = { + .name = "kunit_uapi", + .test_cases = kunit_uapi_test_cases, +}; + kunit_test_suites(&kunit_try_catch_test_suite, &kunit_resource_test_suite, &kunit_log_test_suite, &kunit_status_test_suite, &kunit_current_test_suite, &kunit_device_test_suite, - &kunit_fault_test_suite, &kunit_stub_test_suite); + &kunit_fault_test_suite, &kunit_stub_test_suite, &kunit_uapi_test_suite); MODULE_DESCRIPTION("KUnit test for core test infrastructure"); MODULE_LICENSE("GPL v2"); diff --git a/lib/kunit/kunit-uapi.c b/lib/kunit/kunit-uapi.c new file mode 100644 index 00000000000000..7c87605b9ded9d --- /dev/null +++ b/lib/kunit/kunit-uapi.c @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit Userspace testing API. + * + * Copyright (C) 2025, Linutronix GmbH. + * Author: Thomas Weißschuh + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define KSFT_PASS 0 +#define KSFT_FAIL 1 +#define KSFT_XFAIL 2 +#define KSFT_XPASS 3 +#define KSFT_SKIP 4 + +KUNIT_UAPI_EMBED_BLOB(kunit_uapi_preinit, "uapi-preinit"); + +static struct vfsmount *kunit_uapi_mount_ramfs(void) +{ + struct file_system_type *type; + struct vfsmount *mnt; + + type = get_fs_type("ramfs"); + if (!type) + return ERR_PTR(-ENODEV); + + /* FIXME + * The mount setup is supposed to look like this: + * kunit_uapi_mount_ramfs() sets up a private mount, + * with nothing visible except the new tmpfs. + * Then each executable execution gets a new namespace on top of that + * on which it can mount whatever it needs. + * However I didn't manage to set this up, so keep everything simple + * for now and let somebody familiar with the VFS figure this out. 
+ */ + + mnt = kern_mount(type); + put_filesystem(type); + + return mnt; +} + +static int kunit_uapi_write_file(struct vfsmount *mnt, const char *name, mode_t mode, + const u8 *data, size_t size) +{ + struct file *file; + ssize_t written; + + file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, mode); + if (IS_ERR(file)) + return PTR_ERR(file); + + written = kernel_write(file, data, size, NULL); + filp_close(file, NULL); + if (written != size) { + if (written >= 0) + return -ENOMEM; + return written; + } + + /* Flush delayed fput so exec can open the file read-only */ + flush_delayed_fput(); + + return 0; +} + +static int kunit_uapi_write_executable(struct vfsmount *mnt, + const struct kunit_uapi_blob *executable) +{ + return kunit_uapi_write_file(mnt, kbasename(executable->path), 0755, + executable->data, executable->end - executable->data); +} + +struct kunit_uapi_user_mode_thread_ctx { + const char *executable; + + /* Signals mnt, out, pwd and tgid */ + struct completion setup_done; + struct vfsmount *mnt; + struct file *out; + struct path pwd; + pid_t tgid; + + /* Valid after wait(tgid) */ + int exec_err; +}; + +static int kunit_uapi_user_mode_thread_init(void *data) +{ + struct kunit_uapi_user_mode_thread_ctx *ctx = data; + const char *const argv[] = { + ctx->executable, + NULL + }; + struct file *out[2]; + int err; + + err = create_pipe_files(out, 0); + if (err) + return err; + + /* stdin, use the *write* end to the pipe to have an unreadable input */ + err = replace_fd(0, out[1], 0); + if (err < 0) { + fput(out[0]); + fput(out[1]); + return err; + } + + /* stdout */ + err = replace_fd(1, out[1], 0); + if (err < 0) { + replace_fd(0, NULL, 0); + fput(out[0]); + fput(out[1]); + return err; + } + + /* stderr */ + err = replace_fd(2, out[1], 0); + if (err < 0) { + replace_fd(0, NULL, 0); + replace_fd(1, NULL, 0); + fput(out[0]); + fput(out[1]); + return err; + } + + fput(out[1]); + + ctx->out = out[0]; + ctx->tgid = current->tgid; + + set_fs_pwd(current->fs, 
&ctx->pwd); + kernel_sigaction(SIGKILL, SIG_DFL); + kernel_sigaction(SIGABRT, SIG_DFL); + + complete(&ctx->setup_done); + ctx->exec_err = kernel_execve(kbasename(kunit_uapi_preinit.path), argv, NULL); + if (!ctx->exec_err) + return 0; + do_exit(0); +} + +static size_t kunit_uapi_printk_subtest_lines(struct kunit *test, char *buf, size_t s) +{ + const char *ptr = buf, *newline; + size_t n; + + while (s) { + newline = strnchr(ptr, s, '\n'); + if (!newline) + break; + + n = newline - ptr + 1; + + kunit_log(KERN_INFO, test, KUNIT_SUBSUBTEST_INDENT "%.*s", (int)n, ptr); + ptr += n; + s -= n; + } + + memmove(buf, ptr, s); + + return s; +} + +static int kunit_uapi_forward_to_printk(struct kunit *test, struct file *output) +{ + /* + * printk() automatically adds a newline after each message. + * Therefore only fully accumulated lines can be forwarded. + * Each line needs to fit into the buffer below. + */ + char buf[512]; + size_t s = 0; + ssize_t n; + + while (1) { + n = kernel_read(output, buf + s, sizeof(buf) - s, NULL); + if (n <= 0) + return n; + s = kunit_uapi_printk_subtest_lines(test, buf, s + n); + } +} + +static void kunit_uapi_kill_pid(pid_t pid) +{ + struct pid *p; + + p = find_get_pid(pid); + kill_pid(p, SIGKILL, 1); + put_pid(p); +} + +static int kunit_uapi_run_executable_in_mount(struct kunit *test, const char *executable, + struct vfsmount *mnt) +{ + struct kunit_uapi_user_mode_thread_ctx ctx = { + .setup_done = COMPLETION_INITIALIZER_ONSTACK(ctx.setup_done), + .executable = executable, + .pwd = { + .mnt = mnt, + .dentry = mnt->mnt_root, + }, + }; + int forward_err, wait_err, ret; + pid_t pid; + + /* If SIGCHLD is ignored do_wait won't populate the status. 
*/ + kernel_sigaction(SIGCHLD, SIG_DFL); + pid = user_mode_thread(kunit_uapi_user_mode_thread_init, &ctx, SIGCHLD); + if (pid < 0) { + kernel_sigaction(SIGCHLD, SIG_IGN); + return pid; + } + + wait_for_completion(&ctx.setup_done); + + forward_err = kunit_uapi_forward_to_printk(test, ctx.out); + if (forward_err) + kunit_uapi_kill_pid(ctx.tgid); + + wait_err = kernel_wait(ctx.tgid, &ret); + + /* Restore default kernel sig handler */ + kernel_sigaction(SIGCHLD, SIG_IGN); + + if (ctx.exec_err) + return ctx.exec_err; + if (forward_err) + return forward_err; + if (wait_err < 0) + return wait_err; + return ret; +} + +static int kunit_uapi_run_executable(struct kunit *test, + const struct kunit_uapi_blob *executable) +{ + const char *exe_name = kbasename(executable->path); + struct vfsmount *mnt; + int err; + + mnt = kunit_uapi_mount_ramfs(); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + err = kunit_uapi_write_executable(mnt, &kunit_uapi_preinit); + + if (!err) + err = kunit_uapi_write_executable(mnt, executable); + + if (!err) + err = kunit_uapi_run_executable_in_mount(test, exe_name, mnt); + + kern_unmount(mnt); + + return err; +} + +void kunit_uapi_run_kselftest(struct kunit *test, const struct kunit_uapi_blob *executable) +{ + u8 exit_code, exit_signal; + int err; + + err = kunit_uapi_run_executable(test, executable); + if (err < 0) + KUNIT_FAIL_AND_ABORT(test, "Could not run test executable: %pe\n", ERR_PTR(err)); + + exit_code = err >> 8; + exit_signal = err & 0xff; + + if (exit_signal) + KUNIT_FAIL_AND_ABORT(test, "kselftest exited with signal: %d\n", exit_signal); + else if (exit_code == KSFT_PASS) + ; /* Noop */ + else if (exit_code == KSFT_FAIL) + KUNIT_FAIL_AND_ABORT(test, "kselftest exited with code KSFT_FAIL\n"); + else if (exit_code == KSFT_XPASS) + KUNIT_FAIL_AND_ABORT(test, "kselftest exited with code KSFT_XPASS\n"); + else if (exit_code == KSFT_XFAIL) + ; /* Noop */ + else if (exit_code == KSFT_SKIP) + kunit_mark_skipped(test, "kselftest exited with code 
KSFT_SKIP\n"); + else + KUNIT_FAIL_AND_ABORT(test, "kselftest exited with unknown exit code: %d\n", + exit_code); +} +EXPORT_SYMBOL_GPL(kunit_uapi_run_kselftest); + +MODULE_DESCRIPTION("KUnit UAPI testing framework"); +MODULE_AUTHOR("Thomas Weißschuh + * + * This is *userspace* code. + */ + +#include +#include + +#include "../../tools/testing/selftests/kselftest.h" + +static int setup_api_mount(const char *target, const char *fstype) +{ + int ret; + + ret = mkdir(target, 0755); + if (ret && errno != EEXIST) + return -errno; + + ret = mount("none", target, fstype, 0, NULL); + if (ret && errno != EBUSY) + return -errno; + + return 0; +} + +static void exit_failure(const char *stage, int err) +{ + /* If preinit fails synthesize a failed test report. */ + ksft_print_header(); + ksft_set_plan(1); + ksft_test_result_fail("Failed during test setup: %s: %s\n", stage, strerror(-err)); + ksft_finished(); +} + +int main(int argc, char **argv, char **envp) +{ + int ret; + + ret = setup_api_mount("/proc", "proc"); + if (ret) + exit_failure("mount /proc", ret); + + ret = setup_api_mount("/sys", "sysfs"); + if (ret) + exit_failure("mount /sys", ret); + + if (IS_ENABLED(CONFIG_DEVTMPFS)) { + ret = setup_api_mount("/dev", "devtmpfs"); + if (ret) + exit_failure("mount /dev", ret); + } + + ret = execve(argv[0], argv, envp); + if (ret) + exit_failure("execve", ret); + + return 0; +} diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 9c7f43b9218b3c..a31dd6b84ce190 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -92,6 +92,9 @@ #ifndef _NOLIBC_H #define _NOLIBC_H +/* So that we do not get compatibility types/defines */ +#define __KERNEL__ + #include "std.h" #include "arch.h" #include "types.h" diff --git a/tools/include/nolibc/signal.h b/tools/include/nolibc/signal.h index ac13e53ac31d7c..fa52119e577f84 100644 --- a/tools/include/nolibc/signal.h +++ b/tools/include/nolibc/signal.h @@ -14,6 +14,7 @@ #include "arch.h" #include 
"types.h" #include "sys.h" +#include /* This one is not marked static as it's needed by libgcc for divide by zero */ int raise(int signal); @@ -23,4 +24,37 @@ int raise(int signal) return sys_kill(sys_getpid(), signal); } +/* + * sigaction(int signum, const struct sigaction *act, struct sigaction *oldact) + */ + +#ifdef SA_RESTORER +__attribute__((naked)) +static void my_sa_restorer(void) +{ + my_syscall0(__NR_rt_sigreturn); +} +#endif + +static __attribute__((unused)) +int sys_sigaction(int signum, const struct sigaction *act, struct sigaction *oldact) +{ + struct sigaction real_act = *act; +#ifdef SA_RESTORER + if (!(real_act.sa_flags & SA_RESTORER)) { + real_act.sa_flags |= SA_RESTORER; + real_act.sa_restorer = my_sa_restorer; + } +#endif + + return my_syscall4(__NR_rt_sigaction, signum, &real_act, oldact, + sizeof(act->sa_mask)); +} + +static __attribute__((unused)) +int sigaction(int signum, const struct sigaction *act, struct sigaction *oldact) +{ + return __sysret(sys_sigaction(signum, act, oldact)); +} + #endif /* _NOLIBC_SIGNAL_H */ diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py index 5338489dcbe48c..dd7238e7c6a6c8 100644 --- a/tools/testing/kunit/kunit_parser.py +++ b/tools/testing/kunit/kunit_parser.py @@ -352,9 +352,9 @@ def parse_test_plan(lines: LineStream, test: Test) -> bool: lines.pop() return True -TEST_RESULT = re.compile(r'^\s*(ok|not ok) ([0-9]+) ?(- )?([^#]*)( # .*)?$') +TEST_RESULT = re.compile(r'^\s*(ok|not ok) ([0-9]+)? ?(- )?([^#]*)( # .*)?$') -TEST_RESULT_SKIP = re.compile(r'^\s*(ok|not ok) ([0-9]+) ?(- )?(.*) # SKIP ?(.*)$') +TEST_RESULT_SKIP = re.compile(r'^\s*(ok|not ok) ([0-9]+)? 
?(- )?(.*) # SKIP ?(.*)$') def peek_test_name_match(lines: LineStream, test: Test) -> bool: """ @@ -688,6 +688,11 @@ def bubble_up_test_results(test: Test) -> None: counts.add_status(status) elif test.counts.get_status() == TestStatus.TEST_CRASHED: test.status = TestStatus.TEST_CRASHED + if not test.ok_status(): + for t in subtests: + if not t.ok_status(): + counts.add_status(t.status) + break if status == TestStatus.FAILURE and test.counts.get_status() == TestStatus.SUCCESS: counts.add_status(status) diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index b67408147c1faa..c73914e2e363a0 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -172,7 +172,7 @@ def test_parse_failed_nested_tests_log(self): with open(nested_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) - self.assertEqual(result.counts.failed, 2) + self.assertEqual(result.counts.failed, 3) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[0].status) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.subtests[0].subtests[0].status) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].status) diff --git a/tools/testing/kunit/qemu_configs/loongarch.py b/tools/testing/kunit/qemu_configs/loongarch.py index a92422967d1da9..1dba755284f11f 100644 --- a/tools/testing/kunit/qemu_configs/loongarch.py +++ b/tools/testing/kunit/qemu_configs/loongarch.py @@ -11,6 +11,8 @@ CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_CPU_HAS_LSX=y +CONFIG_CPU_HAS_LASX=y ''', qemu_arch='loongarch64', kernel_path='arch/loongarch/boot/vmlinux.elf',