diff mbox series

[v1,2/2] tests/pid_namespace: add pid_max tests

Message ID	20240222160915.315255-3-aleksandr.mikhalitsyn@canonical.com
State	New
Headers	Received-SPF: pass (google.com: domain of linux-kernel+bounces-76857-ouuuleilei=gmail.com@vger.kernel.org designates 2604:1380:40f1:3f00::1 as permitted sender) client-ip=2604:1380:40f1:3f00::1; From: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com> To: brauner@kernel.org Cc: stgraber@stgraber.org, tycho@tycho.pizza, cyphar@cyphar.com, linux-kernel@vger.kernel.org, Christian Brauner <christian.brauner@ubuntu.com>, Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com> Subject: [PATCH v1 2/2] tests/pid_namespace: add pid_max tests Date: Thu, 22 Feb 2024 17:09:15 +0100 Message-Id: <20240222160915.315255-3-aleksandr.mikhalitsyn@canonical.com> In-Reply-To: <20240222160915.315255-1-aleksandr.mikhalitsyn@canonical.com> References: <20240222160915.315255-1-aleksandr.mikhalitsyn@canonical.com> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-getmail-retrieved-from-mailbox: INBOX
Series	pid_namespace: namespacify sysctl kernel.pid_max \| [v1,0/2] pid_namespace: namespacify sysctl kernel.pid_max [v1,1/2] pid: allow pid_max to be set per pid namespace [v1,2/2] tests/pid_namespace: add pid_max tests

Commit Message

Aleksandr Mikhalitsyn Feb. 22, 2024, 4:09 p.m. UTC

  From: Christian Brauner <christian.brauner@ubuntu.com>

Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
---
 .../selftests/pid_namespace/.gitignore        |   1 +
 .../testing/selftests/pid_namespace/Makefile  |   2 +-
 .../testing/selftests/pid_namespace/pid_max.c | 358 ++++++++++++++++++
 3 files changed, 360 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/pid_namespace/pid_max.c

Comments

Tycho Andersen Feb. 22, 2024, 4:54 p.m. UTC | #1

On Thu, Feb 22, 2024 at 05:09:15PM +0100, Alexander Mikhalitsyn wrote:
> +static int pid_max_nested_limit_inner(void *data)
> +{
> +	int fret = -1, nr_procs = 400;
> +	int fd, ret;
> +	pid_t pid;
> +	pid_t pids[1000];
> +
> +	ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
> +	if (ret) {
> +		fprintf(stderr, "%m - Failed to make rootfs private mount\n");
> +		return fret;
> +	}
> +
> +	umount2("/proc", MNT_DETACH);
> +
> +	ret = mount("proc", "/proc", "proc", 0, NULL);
> +	if (ret) {
> +		fprintf(stderr, "%m - Failed to mount proc\n");
> +		return fret;
> +	}
> +
> +	fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
> +	if (fd < 0) {
> +		fprintf(stderr, "%m - Failed to open pid_max\n");
> +		return fret;
> +	}
> +
> +	ret = write(fd, "500", sizeof("500") - 1);
> +	close(fd);
> +	if (ret < 0) {
> +		fprintf(stderr, "%m - Failed to write pid_max\n");
> +		return fret;
> +	}
> +
> +	for (nr_procs = 0; nr_procs < 500; nr_procs++) {
> +		pid = fork();
> +		if (pid < 0)
> +			break;
> +
> +		if (pid == 0)
> +			exit(EXIT_SUCCESS);
> +
> +		pids[nr_procs] = pid;
> +	}
> +
> +	if (nr_procs >= 400) {
> +		fprintf(stderr, "Managed to create processes beyond the configured outer limit\n");
> +		goto reap;
> +	}

A small quibble, but I wonder about the semantics here. "You can write
whatever you want to this file, but we'll ignore it sometimes" seems
weird to me. What if someone (CRIU) wants to spawn a pid numbered 450
in this case? I suppose they read pid_max first, they'll be able to
tell it's impossible and can exit(1), but returning E2BIG from write()
might be more useful.

Tycho

Christian Brauner Feb. 23, 2024, 4:24 p.m. UTC | #2

On Thu, Feb 22, 2024 at 09:54:08AM -0700, Tycho Andersen wrote:
> On Thu, Feb 22, 2024 at 05:09:15PM +0100, Alexander Mikhalitsyn wrote:
> > +static int pid_max_nested_limit_inner(void *data)
> > +{
> > +	int fret = -1, nr_procs = 400;
> > +	int fd, ret;
> > +	pid_t pid;
> > +	pid_t pids[1000];
> > +
> > +	ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
> > +	if (ret) {
> > +		fprintf(stderr, "%m - Failed to make rootfs private mount\n");
> > +		return fret;
> > +	}
> > +
> > +	umount2("/proc", MNT_DETACH);
> > +
> > +	ret = mount("proc", "/proc", "proc", 0, NULL);
> > +	if (ret) {
> > +		fprintf(stderr, "%m - Failed to mount proc\n");
> > +		return fret;
> > +	}
> > +
> > +	fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
> > +	if (fd < 0) {
> > +		fprintf(stderr, "%m - Failed to open pid_max\n");
> > +		return fret;
> > +	}
> > +
> > +	ret = write(fd, "500", sizeof("500") - 1);
> > +	close(fd);
> > +	if (ret < 0) {
> > +		fprintf(stderr, "%m - Failed to write pid_max\n");
> > +		return fret;
> > +	}
> > +
> > +	for (nr_procs = 0; nr_procs < 500; nr_procs++) {
> > +		pid = fork();
> > +		if (pid < 0)
> > +			break;
> > +
> > +		if (pid == 0)
> > +			exit(EXIT_SUCCESS);
> > +
> > +		pids[nr_procs] = pid;
> > +	}
> > +
> > +	if (nr_procs >= 400) {
> > +		fprintf(stderr, "Managed to create processes beyond the configured outer limit\n");
> > +		goto reap;
> > +	}
> 
> A small quibble, but I wonder about the semantics here. "You can write
> whatever you want to this file, but we'll ignore it sometimes" seems
> weird to me. What if someone (CRIU) wants to spawn a pid numbered 450
> in this case? I suppose they read pid_max first, they'll be able to
> tell it's impossible and can exit(1), but returning E2BIG from write()
> might be more useful.

That's a good idea. But it's a bit tricky. The straightforward thing is
to walk upwards through all ancestor pid namespaces and use the lowest
pid_max value as the upper bound for the current pid namespace. This
will guarantee that you get an error when you try to write a value that
you wouldn't be able to create. The same logic should probably apply to
ns_last_pid as well.

However, that still leaves cases where the current pid namespace writes
a pid_max limit that is allowed (IOW, all ancestor pid namespaces are
above that limit.). But then immediately afterwards an ancestor pid
namespace lowers the pid_max limit. So you can always end up in a
scenario like this.

Tycho Andersen Feb. 24, 2024, 2:59 p.m. UTC | #3

On Fri, Feb 23, 2024 at 05:24:03PM +0100, Christian Brauner wrote:
> On Thu, Feb 22, 2024 at 09:54:08AM -0700, Tycho Andersen wrote:
> > On Thu, Feb 22, 2024 at 05:09:15PM +0100, Alexander Mikhalitsyn wrote:
> > > +static int pid_max_nested_limit_inner(void *data)
> > > +{
> > > +	int fret = -1, nr_procs = 400;
> > > +	int fd, ret;
> > > +	pid_t pid;
> > > +	pid_t pids[1000];
> > > +
> > > +	ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
> > > +	if (ret) {
> > > +		fprintf(stderr, "%m - Failed to make rootfs private mount\n");
> > > +		return fret;
> > > +	}
> > > +
> > > +	umount2("/proc", MNT_DETACH);
> > > +
> > > +	ret = mount("proc", "/proc", "proc", 0, NULL);
> > > +	if (ret) {
> > > +		fprintf(stderr, "%m - Failed to mount proc\n");
> > > +		return fret;
> > > +	}
> > > +
> > > +	fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
> > > +	if (fd < 0) {
> > > +		fprintf(stderr, "%m - Failed to open pid_max\n");
> > > +		return fret;
> > > +	}
> > > +
> > > +	ret = write(fd, "500", sizeof("500") - 1);
> > > +	close(fd);
> > > +	if (ret < 0) {
> > > +		fprintf(stderr, "%m - Failed to write pid_max\n");
> > > +		return fret;
> > > +	}
> > > +
> > > +	for (nr_procs = 0; nr_procs < 500; nr_procs++) {
> > > +		pid = fork();
> > > +		if (pid < 0)
> > > +			break;
> > > +
> > > +		if (pid == 0)
> > > +			exit(EXIT_SUCCESS);
> > > +
> > > +		pids[nr_procs] = pid;
> > > +	}
> > > +
> > > +	if (nr_procs >= 400) {
> > > +		fprintf(stderr, "Managed to create processes beyond the configured outer limit\n");
> > > +		goto reap;
> > > +	}
> > 
> > A small quibble, but I wonder about the semantics here. "You can write
> > whatever you want to this file, but we'll ignore it sometimes" seems
> > weird to me. What if someone (CRIU) wants to spawn a pid numbered 450
> > in this case? I suppose they read pid_max first, they'll be able to
> > tell it's impossible and can exit(1), but returning E2BIG from write()
> > might be more useful.
> 
> That's a good idea. But it's a bit tricky. The straightforward thing is
> to walk upwards through all ancestor pid namespaces and use the lowest
> pid_max value as the upper bound for the current pid namespace. This
> will guarantee that you get an error when you try to write a value that
> you would't be able to create. The same logic should probably apply to
> ns_last_pid as well.
> 
> However, that still leaves cases where the current pid namespace writes
> a pid_max limit that is allowed (IOW, all ancestor pid namespaces are
> above that limit.). But then immediately afterwards an ancestor pid
> namespace lowers the pid_max limit. So you can always end up in a
> scenario like this.

I wonder if we can push edits down too? Or render an .effective file, like
cgroups, though I prefer just putting the right thing in pid_max.

Tycho

Christian Brauner Feb. 26, 2024, 8:57 a.m. UTC | #4

> > > A small quibble, but I wonder about the semantics here. "You can write
> > > whatever you want to this file, but we'll ignore it sometimes" seems
> > > weird to me. What if someone (CRIU) wants to spawn a pid numbered 450
> > > in this case? I suppose they read pid_max first, they'll be able to
> > > tell it's impossible and can exit(1), but returning E2BIG from write()
> > > might be more useful.
> > 
> > That's a good idea. But it's a bit tricky. The straightforward thing is
> > to walk upwards through all ancestor pid namespaces and use the lowest
> > pid_max value as the upper bound for the current pid namespace. This
> > will guarantee that you get an error when you try to write a value that
> > you would't be able to create. The same logic should probably apply to
> > ns_last_pid as well.
> > 
> > However, that still leaves cases where the current pid namespace writes
> > a pid_max limit that is allowed (IOW, all ancestor pid namespaces are
> > above that limit.). But then immediately afterwards an ancestor pid
> > namespace lowers the pid_max limit. So you can always end up in a
> > scenario like this.
> 
> I wonder if we can push edits down too? Or an render .effective file, like

I don't think that works in the current design? The pid_max value is per
struct pid_namespace. And while there is a 1:1 relationship between a
child pid namespace to all of its ancestor pid namespaces there's a 1 to
many relationship between a pid namespace and its child pid namespaces.
IOW, if you change pid_max in pidns_level_1 then you'd have to go
through each of the child pid namespaces on pidns_level_2 which could be
thousands. So you could only do this lazily. IOW, compare and possibly
update the pid_max value of the child pid namespace every time it's read
or written. Maybe that .effective is the way to go; not sure right now.

Tycho Andersen Feb. 26, 2024, 3:30 p.m. UTC | #5

On Mon, Feb 26, 2024 at 09:57:47AM +0100, Christian Brauner wrote:
> > > > A small quibble, but I wonder about the semantics here. "You can write
> > > > whatever you want to this file, but we'll ignore it sometimes" seems
> > > > weird to me. What if someone (CRIU) wants to spawn a pid numbered 450
> > > > in this case? I suppose they read pid_max first, they'll be able to
> > > > tell it's impossible and can exit(1), but returning E2BIG from write()
> > > > might be more useful.
> > > 
> > > That's a good idea. But it's a bit tricky. The straightforward thing is
> > > to walk upwards through all ancestor pid namespaces and use the lowest
> > > pid_max value as the upper bound for the current pid namespace. This
> > > will guarantee that you get an error when you try to write a value that
> > > you would't be able to create. The same logic should probably apply to
> > > ns_last_pid as well.
> > > 
> > > However, that still leaves cases where the current pid namespace writes
> > > a pid_max limit that is allowed (IOW, all ancestor pid namespaces are
> > > above that limit.). But then immediately afterwards an ancestor pid
> > > namespace lowers the pid_max limit. So you can always end up in a
> > > scenario like this.
> > 
> > I wonder if we can push edits down too? Or an render .effective file, like
> 
> I don't think that works in the current design? The pid_max value is per
> struct pid_namespace. And while there is a 1:1 relationship between a
> child pid namespace to all of its ancestor pid namespaces there's a 1 to
> many relationship between a pid namespace and it's child pid namespaces.
> IOW, if you change pid_max in pidns_level_1 then you'd have to go
> through each of the child pid namespaces on pidns_level_2 which could be
> thousands. So you could only do this lazily. IOW, compare and possibly
> update the pid_max value of the child pid namespace everytime it's read
> or written. Maybe that .effective is the way to go; not sure right now.

I wonder then, does it make sense to implement this as a cgroup thing
instead, which is used to doing this kind of traversal?

Or I suppose not, since the idea is to get legacy software that's
writing to pid_max to work?

Tycho

Christian Brauner Feb. 26, 2024, 3:45 p.m. UTC | #6

On Mon, Feb 26, 2024 at 08:30:35AM -0700, Tycho Andersen wrote:
> On Mon, Feb 26, 2024 at 09:57:47AM +0100, Christian Brauner wrote:
> > > > > A small quibble, but I wonder about the semantics here. "You can write
> > > > > whatever you want to this file, but we'll ignore it sometimes" seems
> > > > > weird to me. What if someone (CRIU) wants to spawn a pid numbered 450
> > > > > in this case? I suppose they read pid_max first, they'll be able to
> > > > > tell it's impossible and can exit(1), but returning E2BIG from write()
> > > > > might be more useful.
> > > > 
> > > > That's a good idea. But it's a bit tricky. The straightforward thing is
> > > > to walk upwards through all ancestor pid namespaces and use the lowest
> > > > pid_max value as the upper bound for the current pid namespace. This
> > > > will guarantee that you get an error when you try to write a value that
> > > > you would't be able to create. The same logic should probably apply to
> > > > ns_last_pid as well.
> > > > 
> > > > However, that still leaves cases where the current pid namespace writes
> > > > a pid_max limit that is allowed (IOW, all ancestor pid namespaces are
> > > > above that limit.). But then immediately afterwards an ancestor pid
> > > > namespace lowers the pid_max limit. So you can always end up in a
> > > > scenario like this.
> > > 
> > > I wonder if we can push edits down too? Or an render .effective file, like
> > 
> > I don't think that works in the current design? The pid_max value is per
> > struct pid_namespace. And while there is a 1:1 relationship between a
> > child pid namespace to all of its ancestor pid namespaces there's a 1 to
> > many relationship between a pid namespace and it's child pid namespaces.
> > IOW, if you change pid_max in pidns_level_1 then you'd have to go
> > through each of the child pid namespaces on pidns_level_2 which could be
> > thousands. So you could only do this lazily. IOW, compare and possibly
> > update the pid_max value of the child pid namespace everytime it's read
> > or written. Maybe that .effective is the way to go; not sure right now.
> 
> I wonder then, does it make sense to implement this as a cgroup thing
> instead, which is used to doing this kind of traversal?
> 
> Or I suppose not, since the idea is to get legacy software that's
> writing to pid_max to work?

My personal perspective is that this is not so important. The original
motivation for this had been legacy workloads that expect to only get
pid numbers up to a certain size which would otherwise break. And for
them it doesn't matter whether that setting is applied through pid_max
or via some cgroup setting. All that matters is that they don't get pids
beyond what they expect.

So yes, from my POV we could try and make this a cgroup property. But
we should check with Tejun first whether he'd consider this a useful
addition or not.

Aleksandr Mikhalitsyn Feb. 29, 2024, 3:14 p.m. UTC | #7

On Mon, Feb 26, 2024 at 4:30 PM Tycho Andersen <tycho@tycho.pizza> wrote:
>
> On Mon, Feb 26, 2024 at 09:57:47AM +0100, Christian Brauner wrote:
> > > > > A small quibble, but I wonder about the semantics here. "You can write
> > > > > whatever you want to this file, but we'll ignore it sometimes" seems
> > > > > weird to me. What if someone (CRIU) wants to spawn a pid numbered 450
> > > > > in this case? I suppose they read pid_max first, they'll be able to
> > > > > tell it's impossible and can exit(1), but returning E2BIG from write()
> > > > > might be more useful.
> > > >
> > > > That's a good idea. But it's a bit tricky. The straightforward thing is
> > > > to walk upwards through all ancestor pid namespaces and use the lowest
> > > > pid_max value as the upper bound for the current pid namespace. This
> > > > will guarantee that you get an error when you try to write a value that
> > > > you would't be able to create. The same logic should probably apply to
> > > > ns_last_pid as well.
> > > >
> > > > However, that still leaves cases where the current pid namespace writes
> > > > a pid_max limit that is allowed (IOW, all ancestor pid namespaces are
> > > > above that limit.). But then immediately afterwards an ancestor pid
> > > > namespace lowers the pid_max limit. So you can always end up in a
> > > > scenario like this.
> > >
> > > I wonder if we can push edits down too? Or an render .effective file, like
> >
> > I don't think that works in the current design? The pid_max value is per
> > struct pid_namespace. And while there is a 1:1 relationship between a
> > child pid namespace to all of its ancestor pid namespaces there's a 1 to
> > many relationship between a pid namespace and it's child pid namespaces.
> > IOW, if you change pid_max in pidns_level_1 then you'd have to go
> > through each of the child pid namespaces on pidns_level_2 which could be
> > thousands. So you could only do this lazily. IOW, compare and possibly
> > update the pid_max value of the child pid namespace everytime it's read
> > or written. Maybe that .effective is the way to go; not sure right now.

Hi Tycho!

>
> I wonder then, does it make sense to implement this as a cgroup thing
> instead, which is used to doing this kind of traversal?
>
> Or I suppose not, since the idea is to get legacy software that's
> writing to pid_max to work?

Yes, this is mostly for legacy software that expects host-like
behavior in the container.
I know that folks who work on running Android inside the container are
very-very interested in this.

Kind regards,
Alex

>
> Tycho

Aleksandr Mikhalitsyn Feb. 29, 2024, 4:11 p.m. UTC | #8

On Thu, Feb 29, 2024 at 4:14 PM Aleksandr Mikhalitsyn
<aleksandr.mikhalitsyn@canonical.com> wrote:
>
> On Mon, Feb 26, 2024 at 4:30 PM Tycho Andersen <tycho@tycho.pizza> wrote:
> >
> > On Mon, Feb 26, 2024 at 09:57:47AM +0100, Christian Brauner wrote:
> > > > > > A small quibble, but I wonder about the semantics here. "You can write
> > > > > > whatever you want to this file, but we'll ignore it sometimes" seems
> > > > > > weird to me. What if someone (CRIU) wants to spawn a pid numbered 450
> > > > > > in this case? I suppose they read pid_max first, they'll be able to
> > > > > > tell it's impossible and can exit(1), but returning E2BIG from write()
> > > > > > might be more useful.
> > > > >
> > > > > That's a good idea. But it's a bit tricky. The straightforward thing is
> > > > > to walk upwards through all ancestor pid namespaces and use the lowest
> > > > > pid_max value as the upper bound for the current pid namespace. This
> > > > > will guarantee that you get an error when you try to write a value that
> > > > > you would't be able to create. The same logic should probably apply to
> > > > > ns_last_pid as well.
> > > > >
> > > > > However, that still leaves cases where the current pid namespace writes
> > > > > a pid_max limit that is allowed (IOW, all ancestor pid namespaces are
> > > > > above that limit.). But then immediately afterwards an ancestor pid
> > > > > namespace lowers the pid_max limit. So you can always end up in a
> > > > > scenario like this.
> > > >
> > > > I wonder if we can push edits down too? Or an render .effective file, like
> > >
> > > I don't think that works in the current design? The pid_max value is per
> > > struct pid_namespace. And while there is a 1:1 relationship between a
> > > child pid namespace to all of its ancestor pid namespaces there's a 1 to
> > > many relationship between a pid namespace and it's child pid namespaces.
> > > IOW, if you change pid_max in pidns_level_1 then you'd have to go
> > > through each of the child pid namespaces on pidns_level_2 which could be
> > > thousands. So you could only do this lazily. IOW, compare and possibly
> > > update the pid_max value of the child pid namespace everytime it's read
> > > or written. Maybe that .effective is the way to go; not sure right now.
>
> Hi Tycho!
>
> >
> > I wonder then, does it make sense to implement this as a cgroup thing
> > instead, which is used to doing this kind of traversal?
> >
> > Or I suppose not, since the idea is to get legacy software that's
> > writing to pid_max to work?
>
> Yes, this is mostly for legacy software that expects host-like
> behavior in the container.
> I know that folks who work on running Android inside the container are
> very-very interested in this.

My colleague, Simon Fels, shared with me:
https://android.googlesource.com/platform/bionic.git/+/refs/heads/main/docs/32-bit-abi.md#is-too-small-for-large-pids

>
> Kind regards,
> Alex
>
> >
> > Tycho

diff mbox series

Patch

diff --git a/tools/testing/selftests/pid_namespace/.gitignore b/tools/testing/selftests/pid_namespace/.gitignore
index 93ab9d7e5b7e..5118f0f3edf4 100644
--- a/tools/testing/selftests/pid_namespace/.gitignore
+++ b/tools/testing/selftests/pid_namespace/.gitignore
@@ -1 +1,2 @@ 
+pid_max
 regression_enomem
diff --git a/tools/testing/selftests/pid_namespace/Makefile b/tools/testing/selftests/pid_namespace/Makefile
index 9286a1d22cd3..b972f55d07ae 100644
--- a/tools/testing/selftests/pid_namespace/Makefile
+++ b/tools/testing/selftests/pid_namespace/Makefile
@@ -1,7 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0
 CFLAGS += -g $(KHDR_INCLUDES)
 
-TEST_GEN_PROGS = regression_enomem
+TEST_GEN_PROGS = regression_enomem pid_max
 
 LOCAL_HDRS += $(selfdir)/pidfd/pidfd.h
 
diff --git a/tools/testing/selftests/pid_namespace/pid_max.c b/tools/testing/selftests/pid_namespace/pid_max.c
new file mode 100644
index 000000000000..51c414faabb0
--- /dev/null
+++ b/tools/testing/selftests/pid_namespace/pid_max.c
@@ -0,0 +1,358 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#define _GNU_SOURCE
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/wait.h>
+
+#include "../kselftest_harness.h"
+#include "../pidfd/pidfd.h"
+
+#define __STACK_SIZE (8 * 1024 * 1024)
+
+/*
+ * Start fn(arg) in a child created with clone() on a freshly allocated stack
+ * of __STACK_SIZE bytes. SIGCHLD is always OR-ed into @flags.
+ *
+ * Returns the value clone() returned, or -ENOMEM when the stack could not be
+ * allocated. The caller's copy of the stack is freed before returning.
+ */
+static pid_t do_clone(int (*fn)(void *), void *arg, int flags)
+{
+	const int clone_flags = flags | SIGCHLD;
+	char *child_stack = malloc(__STACK_SIZE);
+	pid_t child;
+
+	if (child_stack == NULL)
+		return -ENOMEM;
+
+#ifdef __ia64__
+	child = __clone2(fn, child_stack, __STACK_SIZE, clone_flags, arg);
+#else
+	/* Hand clone() the address just past the end of the allocation. */
+	child = clone(fn, child_stack + __STACK_SIZE, clone_flags, arg);
+#endif
+
+	free(child_stack);
+	return child;
+}
+
+/*
+ * Body of the pid_max_simple test; started through do_clone() with
+ * CLONE_NEWPID | CLONE_NEWNS.
+ *
+ * Makes / a private mount, mounts a fresh procfs on /proc, writes 500 to
+ * /proc/sys/kernel/pid_max and then forks and reaps 501 children one at a
+ * time. The test fails as soon as a child is handed a pid that is not below
+ * the configured limit.
+ *
+ * @data is unused.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+static int pid_max_cb(void *data)
+{
+	int fd, ret;
+	pid_t pid;
+
+	ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
+	if (ret) {
+		fprintf(stderr, "%m - Failed to make rootfs private mount\n");
+		return -1;
+	}
+
+	/* Best effort: drop whatever procfs instance is currently visible. */
+	umount2("/proc", MNT_DETACH);
+
+	ret = mount("proc", "/proc", "proc", 0, NULL);
+	if (ret) {
+		fprintf(stderr, "%m - Failed to mount proc\n");
+		return -1;
+	}
+
+	fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
+	if (fd < 0) {
+		fprintf(stderr, "%m - Failed to open pid_max\n");
+		return -1;
+	}
+
+	ret = write(fd, "500", sizeof("500") - 1);
+	if (ret < 0) {
+		/* Report first so that close() cannot disturb errno for %m. */
+		fprintf(stderr, "%m - Failed to write pid_max\n");
+		close(fd);
+		return -1;
+	}
+	/* The descriptor is not needed past this point; do not leak it. */
+	close(fd);
+
+	for (int i = 0; i < 501; i++) {
+		pid = fork();
+		if (pid < 0) {
+			fprintf(stderr, "%m - Failed to fork\n");
+			return -1;
+		}
+
+		if (pid == 0)
+			exit(EXIT_SUCCESS);
+
+		wait_for_pid(pid);
+		/* pid_max itself is never handed out, so 500 is already too big. */
+		if (pid >= 500) {
+			fprintf(stderr, "Managed to create pid number beyond limit\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Runs in the nested pid namespace that pid_max_nested_outer() clones.
+ *
+ * Writes 500 to this namespace's pid_max and then checks two things:
+ *  - after one successful fork() a second fork() has to fail (the caller is
+ *    expected to have nearly used up the ancestor namespace's limit);
+ *  - once that first child has been reaped, 510 fork-and-reap rounds never
+ *    return a pid of 500 or more.
+ *
+ * @data is unused.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+static int pid_max_nested_inner(void *data)
+{
+	int fret = -1;
+	pid_t pids[2];
+	int fd, i, ret;
+
+	ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
+	if (ret) {
+		fprintf(stderr, "%m - Failed to make rootfs private mount\n");
+		return fret;
+	}
+
+	/* Best effort: drop whatever procfs instance is currently visible. */
+	umount2("/proc", MNT_DETACH);
+
+	ret = mount("proc", "/proc", "proc", 0, NULL);
+	if (ret) {
+		fprintf(stderr, "%m - Failed to mount proc\n");
+		return fret;
+	}
+
+	fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
+	if (fd < 0) {
+		fprintf(stderr, "%m - Failed to open pid_max\n");
+		return fret;
+	}
+
+	ret = write(fd, "500", sizeof("500") - 1);
+	close(fd);
+	if (ret < 0) {
+		fprintf(stderr, "%m - Failed to write pid_max\n");
+		return fret;
+	}
+
+	pids[0] = fork();
+	if (pids[0] < 0) {
+		fprintf(stderr, "Failed to create first new process\n");
+		return fret;
+	}
+
+	if (pids[0] == 0)
+		exit(EXIT_SUCCESS);
+
+	pids[1] = fork();
+	/*
+	 * If the second fork() unexpectedly worked, let that child leave right
+	 * away: pids[0] is its sibling, not its child, so it has nothing to
+	 * wait for.
+	 */
+	if (pids[1] == 0)
+		exit(EXIT_SUCCESS);
+
+	wait_for_pid(pids[0]);
+	if (pids[1] > 0) {
+		wait_for_pid(pids[1]);
+
+		fprintf(stderr, "Managed to create process even though ancestor pid namespace had a limit\n");
+		return fret;
+	}
+
+	/*
+	 * Now make sure that pids wrap before reaching the 500 written above.
+	 * NOTE(review): this comment used to say 400 (the limit the caller
+	 * sets) while the check below has always compared against 500 -
+	 * confirm which bound is intended.
+	 */
+	for (i = 0; i < 510; i++) {
+		pid_t pid;
+
+		pid = fork();
+		if (pid < 0)
+			return fret;
+
+		if (pid == 0)
+			exit(EXIT_SUCCESS);
+
+		wait_for_pid(pid);
+		if (pid >= 500) {
+			fprintf(stderr, "Managed to create process with pid %d beyond configured limit\n", pid);
+			return fret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Body of the pid_max_nested test; started through do_clone() with
+ * CLONE_NEWPID | CLONE_NEWNS.
+ *
+ * Writes 400 to this namespace's pid_max, forks 396 children that are left
+ * unreaped, and then runs pid_max_nested_inner() in a further nested pid
+ * namespace. All children forked here are reaped before returning.
+ *
+ * @data is unused.
+ *
+ * Returns 0 if the nested callback succeeded and -1 on any failure.
+ */
+static int pid_max_nested_outer(void *data)
+{
+	int fret = -1, nr_procs = 0;
+	pid_t pids[1000];
+	int fd, ret;
+	pid_t pid;
+
+	ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
+	if (ret) {
+		fprintf(stderr, "%m - Failed to make rootfs private mount\n");
+		return fret;
+	}
+
+	/* Best effort: drop whatever procfs instance is currently visible. */
+	umount2("/proc", MNT_DETACH);
+
+	ret = mount("proc", "/proc", "proc", 0, NULL);
+	if (ret) {
+		fprintf(stderr, "%m - Failed to mount proc\n");
+		return fret;
+	}
+
+	fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
+	if (fd < 0) {
+		fprintf(stderr, "%m - Failed to open pid_max\n");
+		return fret;
+	}
+
+	ret = write(fd, "400", sizeof("400") - 1);
+	close(fd);
+	if (ret < 0) {
+		fprintf(stderr, "%m - Failed to write pid_max\n");
+		return fret;
+	}
+
+	/*
+	 * Fork 396 children and keep them unreaped so that their pids stay
+	 * in use. Counting this process that makes 397 processes, which
+	 * leaves room for do_clone() (398) and one more (399). So creating
+	 * another process after that needs to fail.
+	 */
+	for (nr_procs = 0; nr_procs < 396; nr_procs++) {
+		pid = fork();
+		if (pid < 0)
+			goto reap;
+
+		if (pid == 0)
+			exit(EXIT_SUCCESS);
+
+		pids[nr_procs] = pid;
+	}
+
+	pid = do_clone(pid_max_nested_inner, NULL, CLONE_NEWPID | CLONE_NEWNS);
+	if (pid < 0) {
+		fprintf(stderr, "%m - Failed to clone nested pidns\n");
+		goto reap;
+	}
+
+	if (wait_for_pid(pid)) {
+		fprintf(stderr, "%m - Nested pid_max failed\n");
+		goto reap;
+	}
+
+	fret = 0;
+
+reap:
+	/* Only the first nr_procs slots of pids[] were filled in. */
+	for (int i = 0; i < nr_procs; i++)
+		wait_for_pid(pids[i]);
+
+	return fret;
+}
+
+/*
+ * Runs in the nested pid namespace that pid_max_nested_limit_outer() clones.
+ *
+ * Writes 500 to this namespace's pid_max and then forks up to 500 children
+ * without reaping them, stopping at the first fork() failure. Reaching 400
+ * or more children counts as a failure. Every child that was created is
+ * reaped before returning.
+ *
+ * @data is unused.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+static int pid_max_nested_limit_inner(void *data)
+{
+	pid_t children[1000];
+	int spawned = 0;
+	int status = -1;
+	int sysctl_fd, rc;
+
+	if (mount("", "/", NULL, MS_PRIVATE | MS_REC, 0)) {
+		fprintf(stderr, "%m - Failed to make rootfs private mount\n");
+		return status;
+	}
+
+	/* Best effort: the result of the detach is deliberately ignored. */
+	umount2("/proc", MNT_DETACH);
+
+	if (mount("proc", "/proc", "proc", 0, NULL)) {
+		fprintf(stderr, "%m - Failed to mount proc\n");
+		return status;
+	}
+
+	sysctl_fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
+	if (sysctl_fd < 0) {
+		fprintf(stderr, "%m - Failed to open pid_max\n");
+		return status;
+	}
+
+	rc = write(sysctl_fd, "500", sizeof("500") - 1);
+	close(sysctl_fd);
+	if (rc < 0) {
+		fprintf(stderr, "%m - Failed to write pid_max\n");
+		return status;
+	}
+
+	/* Keep forking until either 500 children exist or fork() gives up. */
+	while (spawned < 500) {
+		pid_t child = fork();
+
+		if (child < 0)
+			break;
+
+		if (child == 0)
+			exit(EXIT_SUCCESS);
+
+		children[spawned++] = child;
+	}
+
+	if (spawned < 400)
+		status = 0;
+	else
+		fprintf(stderr, "Managed to create processes beyond the configured outer limit\n");
+
+	for (int idx = 0; idx < spawned; idx++)
+		wait_for_pid(children[idx]);
+
+	return status;
+}
+
+/*
+ * Body of the pid_max_nested_limit test; started through do_clone() with
+ * CLONE_NEWPID | CLONE_NEWNS.
+ *
+ * Writes 400 to this namespace's pid_max and then runs
+ * pid_max_nested_limit_inner() in a further nested pid and mount namespace.
+ *
+ * @data is unused.
+ *
+ * Returns 0 if the nested callback succeeded and -1 on any failure.
+ */
+static int pid_max_nested_limit_outer(void *data)
+{
+	int sysctl_fd, rc;
+	pid_t nested;
+
+	if (mount("", "/", NULL, MS_PRIVATE | MS_REC, 0)) {
+		fprintf(stderr, "%m - Failed to make rootfs private mount\n");
+		return -1;
+	}
+
+	/* Best effort: the result of the detach is deliberately ignored. */
+	umount2("/proc", MNT_DETACH);
+
+	if (mount("proc", "/proc", "proc", 0, NULL)) {
+		fprintf(stderr, "%m - Failed to mount proc\n");
+		return -1;
+	}
+
+	sysctl_fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
+	if (sysctl_fd < 0) {
+		fprintf(stderr, "%m - Failed to open pid_max\n");
+		return -1;
+	}
+
+	rc = write(sysctl_fd, "400", sizeof("400") - 1);
+	close(sysctl_fd);
+	if (rc < 0) {
+		fprintf(stderr, "%m - Failed to write pid_max\n");
+		return -1;
+	}
+
+	nested = do_clone(pid_max_nested_limit_inner, NULL, CLONE_NEWPID | CLONE_NEWNS);
+	if (nested < 0) {
+		fprintf(stderr, "%m - Failed to clone nested pidns\n");
+		return -1;
+	}
+
+	if (wait_for_pid(nested) != 0) {
+		fprintf(stderr, "%m - Nested pid_max failed\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Run pid_max_cb() in a new pid and mount namespace and expect it to succeed. */
+TEST(pid_max_simple)
+{
+	pid_t child = do_clone(pid_max_cb, NULL, CLONE_NEWPID | CLONE_NEWNS);
+
+	ASSERT_GT(child, 0);
+	ASSERT_EQ(0, wait_for_pid(child));
+}
+
+/*
+ * Run pid_max_nested_limit_outer() in a new pid and mount namespace and
+ * expect it to succeed.
+ */
+TEST(pid_max_nested_limit)
+{
+	pid_t child = do_clone(pid_max_nested_limit_outer, NULL, CLONE_NEWPID | CLONE_NEWNS);
+
+	ASSERT_GT(child, 0);
+	ASSERT_EQ(0, wait_for_pid(child));
+}
+
+/*
+ * Run pid_max_nested_outer() in a new pid and mount namespace and expect it
+ * to succeed.
+ */
+TEST(pid_max_nested)
+{
+	pid_t child = do_clone(pid_max_nested_outer, NULL, CLONE_NEWPID | CLONE_NEWNS);
+
+	ASSERT_GT(child, 0);
+	ASSERT_EQ(0, wait_for_pid(child));
+}
+
+TEST_HARNESS_MAIN