[v2,4/5] perf kvm: Support sampling guest callchains

Message ID SY4P282MB108433024762F1F292D47C2A9DCFA@SY4P282MB1084.AUSP282.PROD.OUTLOOK.COM
State New
Headers
Series perf: KVM: Enable callchains for guests |

Commit Message

Tianyi Liu Oct. 8, 2023, 2:57 p.m. UTC
  This patch provides support for sampling guests' callchains.

The signature of `get_perf_callchain` has been modified to explicitly
specify whether it needs to sample the host or guest callchain.
Based on the context, it will distribute the sampling request to one of
`perf_callchain_user`, `perf_callchain_kernel`, or `perf_callchain_guest`.

The reason for separately implementing `perf_callchain_user` and
`perf_callchain_kernel` is that the kernel may utilize special unwinders
such as `ORC`. However, for the guest, we only support stackframe-based
unwinding, so the implementation is generic and only needs to be
separately implemented for 32-bit and 64-bit.

Signed-off-by: Tianyi Liu <i.pear@outlook.com>
---
 arch/x86/events/core.c     | 56 +++++++++++++++++++++++++++++++-------
 include/linux/perf_event.h |  3 +-
 kernel/bpf/stackmap.c      |  8 +++---
 kernel/events/callchain.c  | 27 +++++++++++++++++-
 kernel/events/core.c       |  7 ++++-
 5 files changed, 84 insertions(+), 17 deletions(-)
  

Comments

kernel test robot Oct. 8, 2023, 7:57 p.m. UTC | #1
Hi Tianyi,

kernel test robot noticed the following build warnings:

[auto build test WARNING on 8a749fd1a8720d4619c91c8b6e7528c0a355c0aa]

url:    https://github.com/intel-lab-lkp/linux/commits/Tianyi-Liu/KVM-Add-arch-specific-interfaces-for-sampling-guest-callchains/20231008-230042
base:   8a749fd1a8720d4619c91c8b6e7528c0a355c0aa
patch link:    https://lore.kernel.org/r/SY4P282MB108433024762F1F292D47C2A9DCFA%40SY4P282MB1084.AUSP282.PROD.OUTLOOK.COM
patch subject: [PATCH v2 4/5] perf kvm: Support sampling guest callchains
config: i386-tinyconfig (https://download.01.org/0day-ci/archive/20231009/202310090338.4PmYjmBS-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231009/202310090338.4PmYjmBS-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310090338.4PmYjmBS-lkp@intel.com/

All warnings (new ones prefixed by >>):

   arch/x86/events/core.c: In function 'perf_callchain_guest32':
>> arch/x86/events/core.c:2784:43: warning: passing argument 1 of 'perf_guest_read_virt' discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
    2784 |                 if (!perf_guest_read_virt(&fp->next_frame, &frame.next_frame,
         |                                           ^~~~~~~~~~~~~~~
   In file included from arch/x86/events/core.c:15:
   include/linux/perf_event.h:1531:41: note: expected 'void *' but argument is of type 'const u32 *' {aka 'const unsigned int *'}
    1531 | static inline bool perf_guest_read_virt(void*, void*, unsigned int)     { return 0; }
         |                                         ^~~~~
   arch/x86/events/core.c:2787:43: warning: passing argument 1 of 'perf_guest_read_virt' discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
    2787 |                 if (!perf_guest_read_virt(&fp->return_address, &frame.return_address,
         |                                           ^~~~~~~~~~~~~~~~~~~
   include/linux/perf_event.h:1531:41: note: expected 'void *' but argument is of type 'const u32 *' {aka 'const unsigned int *'}
    1531 | static inline bool perf_guest_read_virt(void*, void*, unsigned int)     { return 0; }
         |                                         ^~~~~
   arch/x86/events/core.c: In function 'perf_callchain_guest':
   arch/x86/events/core.c:2808:51: warning: passing argument 1 of 'perf_guest_read_virt' discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
    2808 |                         if (!perf_guest_read_virt(&fp->next_frame, &frame.next_frame,
         |                                                   ^~~~~~~~~~~~~~~
   include/linux/perf_event.h:1531:41: note: expected 'void *' but argument is of type 'struct stack_frame * const*'
    1531 | static inline bool perf_guest_read_virt(void*, void*, unsigned int)     { return 0; }
         |                                         ^~~~~
   arch/x86/events/core.c:2811:51: warning: passing argument 1 of 'perf_guest_read_virt' discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
    2811 |                         if (!perf_guest_read_virt(&fp->return_address, &frame.return_address,
         |                                                   ^~~~~~~~~~~~~~~~~~~
   include/linux/perf_event.h:1531:41: note: expected 'void *' but argument is of type 'const long unsigned int *'
    1531 | static inline bool perf_guest_read_virt(void*, void*, unsigned int)     { return 0; }
         |                                         ^~~~~


vim +2784 arch/x86/events/core.c

  2775	
  2776	static inline void
  2777	perf_callchain_guest32(struct perf_callchain_entry_ctx *entry)
  2778	{
  2779		struct stack_frame_ia32 frame;
  2780		const struct stack_frame_ia32 *fp;
  2781	
  2782		fp = (void *)perf_guest_get_frame_pointer();
  2783		while (fp && entry->nr < entry->max_stack) {
> 2784			if (!perf_guest_read_virt(&fp->next_frame, &frame.next_frame,
  2785				sizeof(frame.next_frame)))
  2786				break;
  2787			if (!perf_guest_read_virt(&fp->return_address, &frame.return_address,
  2788				sizeof(frame.return_address)))
  2789				break;
  2790			perf_callchain_store(entry, frame.return_address);
  2791			fp = (void *)frame.next_frame;
  2792		}
  2793	}
  2794
  
Maxim Levitsky Oct. 10, 2023, 4:12 p.m. UTC | #2
On Sun, 2023-10-08 at 22:57 +0800, Tianyi Liu wrote:
> This patch provides support for sampling guests' callchains.
> 
> The signature of `get_perf_callchain` has been modified to explicitly
> specify whether it needs to sample the host or guest callchain.
> Based on the context, it will distribute the sampling request to one of
> `perf_callchain_user`, `perf_callchain_kernel`, or `perf_callchain_guest`.
> 
> The reason for separately implementing `perf_callchain_user` and
> `perf_callchain_kernel` is that the kernel may utilize special unwinders
> such as `ORC`. However, for the guest, we only support stackframe-based
> unwinding, so the implementation is generic and only needs to be
> separately implemented for 32-bit and 64-bit.
> 
> Signed-off-by: Tianyi Liu <i.pear@outlook.com>
> ---
>  arch/x86/events/core.c     | 56 +++++++++++++++++++++++++++++++-------
>  include/linux/perf_event.h |  3 +-
>  kernel/bpf/stackmap.c      |  8 +++---
>  kernel/events/callchain.c  | 27 +++++++++++++++++-
>  kernel/events/core.c       |  7 ++++-
>  5 files changed, 84 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 185f902e5..ea4c86175 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2758,11 +2758,6 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
>  	struct unwind_state state;
>  	unsigned long addr;
>  
> -	if (perf_guest_state()) {
> -		/* TODO: We don't support guest os callchain now */
> -		return;
> -	}
> -
>  	if (perf_callchain_store(entry, regs->ip))
>  		return;
>  
> @@ -2778,6 +2773,52 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
>  	}
>  }
>  
> +static inline void
> +perf_callchain_guest32(struct perf_callchain_entry_ctx *entry)
> +{
> +	struct stack_frame_ia32 frame;
> +	const struct stack_frame_ia32 *fp;
> +
> +	fp = (void *)perf_guest_get_frame_pointer();
> +	while (fp && entry->nr < entry->max_stack) {
> +		if (!perf_guest_read_virt(&fp->next_frame, &frame.next_frame,
This should be fp->next_frame.
> +			sizeof(frame.next_frame)))
> +			break;
> +		if (!perf_guest_read_virt(&fp->return_address, &frame.return_address,
Same here.
> +			sizeof(frame.return_address)))
> +			break;
> +		perf_callchain_store(entry, frame.return_address);
> +		fp = (void *)frame.next_frame;
> +	}
> +}
> +
> +void
> +perf_callchain_guest(struct perf_callchain_entry_ctx *entry)
> +{
> +	struct stack_frame frame;
> +	const struct stack_frame *fp;
> +	unsigned int guest_state;
> +
> +	guest_state = perf_guest_state();
> +	perf_callchain_store(entry, perf_guest_get_ip());
> +
> +	if (guest_state & PERF_GUEST_64BIT) {
> +		fp = (void *)perf_guest_get_frame_pointer();
> +		while (fp && entry->nr < entry->max_stack) {
> +			if (!perf_guest_read_virt(&fp->next_frame, &frame.next_frame,
Same here.
> +				sizeof(frame.next_frame)))
> +				break;
> +			if (!perf_guest_read_virt(&fp->return_address, &frame.return_address,
And here.

> +				sizeof(frame.return_address)))
> +				break;
> +			perf_callchain_store(entry, frame.return_address);
> +			fp = (void *)frame.next_frame;
> +		}
> +	} else {
> +		perf_callchain_guest32(entry);
> +	}
> +}

For symmetry, maybe it makes sense to have perf_callchain_guest32 and perf_callchain_guest64
and then make perf_callchain_guest dispatch to the appropriate one? No strong opinion on this, of course.


> +
>  static inline int
>  valid_user_frame(const void __user *fp, unsigned long size)
>  {
> @@ -2861,11 +2902,6 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
>  	struct stack_frame frame;
>  	const struct stack_frame __user *fp;
>  
> -	if (perf_guest_state()) {
> -		/* TODO: We don't support guest os callchain now */
> -		return;
> -	}
> -
>  	/*
>  	 * We don't know what to do with VM86 stacks.. ignore them for now.
>  	 */
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index d0f937a62..a2baf4856 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -1545,9 +1545,10 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
>  
>  extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
>  extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
> +extern void perf_callchain_guest(struct perf_callchain_entry_ctx *entry);
>  extern struct perf_callchain_entry *
>  get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
> -		   u32 max_stack, bool crosstask, bool add_mark);
> +		   bool host, bool guest, u32 max_stack, bool crosstask, bool add_mark);
>  extern int get_callchain_buffers(int max_stack);
>  extern void put_callchain_buffers(void);
>  extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 458bb80b1..2e88d4639 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -294,8 +294,8 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
>  	if (max_depth > sysctl_perf_event_max_stack)
>  		max_depth = sysctl_perf_event_max_stack;
>  
> -	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
> -				   false, false);
> +	trace = get_perf_callchain(regs, 0, kernel, user, true, false,
> +				   max_depth, false, false);
>  
>  	if (unlikely(!trace))
>  		/* couldn't fetch the stack trace */
> @@ -420,8 +420,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
>  	else if (kernel && task)
>  		trace = get_callchain_entry_for_task(task, max_depth);
>  	else
> -		trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
> -					   false, false);
> +		trace = get_perf_callchain(regs, 0, kernel, user, true, false,
> +					   max_depth, false, false);
>  	if (unlikely(!trace))
>  		goto err_fault;
>  
> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index 1273be843..7e80729e9 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -45,6 +45,10 @@ __weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
>  {
>  }
>  
> +__weak void perf_callchain_guest(struct perf_callchain_entry_ctx *entry)
> +{
> +}
> +
>  static void release_callchain_buffers_rcu(struct rcu_head *head)
>  {
>  	struct callchain_cpus_entries *entries;
> @@ -178,11 +182,12 @@ put_callchain_entry(int rctx)
>  
>  struct perf_callchain_entry *
>  get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
> -		   u32 max_stack, bool crosstask, bool add_mark)
> +		   bool host, bool guest, u32 max_stack, bool crosstask, bool add_mark)
>  {
>  	struct perf_callchain_entry *entry;
>  	struct perf_callchain_entry_ctx ctx;
>  	int rctx;
> +	unsigned int guest_state;
>  
>  	entry = get_callchain_entry(&rctx);
>  	if (!entry)
> @@ -194,6 +199,26 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
>  	ctx.contexts       = 0;
>  	ctx.contexts_maxed = false;
>  
> +	guest_state = perf_guest_state();
> +	if (guest_state) {
> +		if (!guest)
> +			goto exit_put;
> +		if (user && (guest_state & PERF_GUEST_USER)) {
> +			if (add_mark)
> +				perf_callchain_store_context(&ctx, PERF_CONTEXT_GUEST_USER);
> +			perf_callchain_guest(&ctx);
> +		}
> +		if (kernel && !(guest_state & PERF_GUEST_USER)) {
> +			if (add_mark)
> +				perf_callchain_store_context(&ctx, PERF_CONTEXT_GUEST_KERNEL);
> +			perf_callchain_guest(&ctx);
> +		}
> +		goto exit_put;
> +	}
> +
> +	if (unlikely(!host))
> +		goto exit_put;
> +
>  	if (kernel && !user_mode(regs)) {
>  		if (add_mark)
>  			perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index eaba00ec2..b3401f403 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7559,6 +7559,8 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
>  {
>  	bool kernel = !event->attr.exclude_callchain_kernel;
>  	bool user   = !event->attr.exclude_callchain_user;
> +	bool host   = !event->attr.exclude_host;
> +	bool guest  = !event->attr.exclude_guest;
>  	/* Disallow cross-task user callchains. */
>  	bool crosstask = event->ctx->task && event->ctx->task != current;
>  	const u32 max_stack = event->attr.sample_max_stack;
> @@ -7567,7 +7569,10 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
>  	if (!kernel && !user)
>  		return &__empty_callchain;
>  
> -	callchain = get_perf_callchain(regs, 0, kernel, user,
> +	if (!host && !guest)
> +		return &__empty_callchain;
> +
> +	callchain = get_perf_callchain(regs, 0, kernel, user, host, guest,
>  				       max_stack, crosstask, true);
>  	return callchain ?: &__empty_callchain;
>  }


Best regards,
	Maxim Levitsky
  
Tianyi Liu Oct. 11, 2023, 2:44 p.m. UTC | #3
Hi Maxim,

At 2023-10-10 16:12 +0000, Maxim Levitsky wrote:
> > +static inline void
> > +perf_callchain_guest32(struct perf_callchain_entry_ctx *entry)
> > +{
> > +	struct stack_frame_ia32 frame;
> > +	const struct stack_frame_ia32 *fp;
> > +
> > +	fp = (void *)perf_guest_get_frame_pointer();
> > +	while (fp && entry->nr < entry->max_stack) {
> > +		if (!perf_guest_read_virt(&fp->next_frame, &frame.next_frame,
> This should be fp->next_frame.
> > +			sizeof(frame.next_frame)))
> > +			break;
> > +		if (!perf_guest_read_virt(&fp->return_address, &frame.return_address,
> Same here.
> > +			sizeof(frame.return_address)))
> > +			break;
> > +		perf_callchain_store(entry, frame.return_address);
> > +		fp = (void *)frame.next_frame;
> > +	}
> > +}
> > +

Here `fp` is a guest virtual address: the frame it points to lives in guest
memory, not in the directly accessible kernel address space. `&fp->next_frame`
and `&fp->return_address` do not dereference `fp`; they simply compute the
field addresses in a more readable manner, much like `fp + 0` and `fp + 4`
(as byte offsets).

The original implementation of `perf_callchain_user` and
`perf_callchain_user32` also use this approach [1].

>
> For symmetry, maybe it makes sense to have perf_callchain_guest32 and perf_callchain_guest64
> and then make perf_callchain_guest call each? No strong opinion on this of course.
>

The `perf_callchain_guest` and `perf_callchain_guest32` functions here are
simply designed to mimic `perf_callchain_user` and `perf_callchain_user32` [2].
I'm also open to making the logic fully separate if this doesn't seem
elegant enough.

[1] https://github.com/torvalds/linux/blob/master/arch/x86/events/core.c#L2890
[2] https://github.com/torvalds/linux/blob/master/arch/x86/events/core.c#L2820


Best regards,
Tianyi Liu
  
kernel test robot Oct. 12, 2023, 8:41 p.m. UTC | #4
Hi Tianyi,

kernel test robot noticed the following build warnings:

[auto build test WARNING on 8a749fd1a8720d4619c91c8b6e7528c0a355c0aa]

url:    https://github.com/intel-lab-lkp/linux/commits/Tianyi-Liu/KVM-Add-arch-specific-interfaces-for-sampling-guest-callchains/20231008-230042
base:   8a749fd1a8720d4619c91c8b6e7528c0a355c0aa
patch link:    https://lore.kernel.org/r/SY4P282MB108433024762F1F292D47C2A9DCFA%40SY4P282MB1084.AUSP282.PROD.OUTLOOK.COM
patch subject: [PATCH v2 4/5] perf kvm: Support sampling guest callchains
config: i386-randconfig-061-20231012 (https://download.01.org/0day-ci/archive/20231013/202310130419.cIkNaYZm-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231013/202310130419.cIkNaYZm-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310130419.cIkNaYZm-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> arch/x86/events/core.c:2808:52: sparse: sparse: incorrect type in argument 1 (different modifiers) @@     expected void *addr @@     got struct stack_frame *const * @@
   arch/x86/events/core.c:2808:52: sparse:     expected void *addr
   arch/x86/events/core.c:2808:52: sparse:     got struct stack_frame *const *
>> arch/x86/events/core.c:2811:52: sparse: sparse: incorrect type in argument 1 (different modifiers) @@     expected void *addr @@     got unsigned long const * @@
   arch/x86/events/core.c:2811:52: sparse:     expected void *addr
   arch/x86/events/core.c:2811:52: sparse:     got unsigned long const *
>> arch/x86/events/core.c:2784:44: sparse: sparse: incorrect type in argument 1 (different modifiers) @@     expected void *addr @@     got unsigned int const * @@
   arch/x86/events/core.c:2784:44: sparse:     expected void *addr
   arch/x86/events/core.c:2784:44: sparse:     got unsigned int const *
   arch/x86/events/core.c:2787:44: sparse: sparse: incorrect type in argument 1 (different modifiers) @@     expected void *addr @@     got unsigned int const * @@
   arch/x86/events/core.c:2787:44: sparse:     expected void *addr
   arch/x86/events/core.c:2787:44: sparse:     got unsigned int const *

vim +2808 arch/x86/events/core.c

  2775	
  2776	static inline void
  2777	perf_callchain_guest32(struct perf_callchain_entry_ctx *entry)
  2778	{
  2779		struct stack_frame_ia32 frame;
  2780		const struct stack_frame_ia32 *fp;
  2781	
  2782		fp = (void *)perf_guest_get_frame_pointer();
  2783		while (fp && entry->nr < entry->max_stack) {
> 2784			if (!perf_guest_read_virt(&fp->next_frame, &frame.next_frame,
  2785				sizeof(frame.next_frame)))
  2786				break;
  2787			if (!perf_guest_read_virt(&fp->return_address, &frame.return_address,
  2788				sizeof(frame.return_address)))
  2789				break;
  2790			perf_callchain_store(entry, frame.return_address);
  2791			fp = (void *)frame.next_frame;
  2792		}
  2793	}
  2794	
  2795	void
  2796	perf_callchain_guest(struct perf_callchain_entry_ctx *entry)
  2797	{
  2798		struct stack_frame frame;
  2799		const struct stack_frame *fp;
  2800		unsigned int guest_state;
  2801	
  2802		guest_state = perf_guest_state();
  2803		perf_callchain_store(entry, perf_guest_get_ip());
  2804	
  2805		if (guest_state & PERF_GUEST_64BIT) {
  2806			fp = (void *)perf_guest_get_frame_pointer();
  2807			while (fp && entry->nr < entry->max_stack) {
> 2808				if (!perf_guest_read_virt(&fp->next_frame, &frame.next_frame,
  2809					sizeof(frame.next_frame)))
  2810					break;
> 2811				if (!perf_guest_read_virt(&fp->return_address, &frame.return_address,
  2812					sizeof(frame.return_address)))
  2813					break;
  2814				perf_callchain_store(entry, frame.return_address);
  2815				fp = (void *)frame.next_frame;
  2816			}
  2817		} else {
  2818			perf_callchain_guest32(entry);
  2819		}
  2820	}
  2821
  

Patch

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 185f902e5..ea4c86175 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2758,11 +2758,6 @@  perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 	struct unwind_state state;
 	unsigned long addr;
 
-	if (perf_guest_state()) {
-		/* TODO: We don't support guest os callchain now */
-		return;
-	}
-
 	if (perf_callchain_store(entry, regs->ip))
 		return;
 
@@ -2778,6 +2773,52 @@  perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 	}
 }
 
+static inline void
+perf_callchain_guest32(struct perf_callchain_entry_ctx *entry)
+{
+	struct stack_frame_ia32 frame;
+	const struct stack_frame_ia32 *fp;
+
+	fp = (void *)perf_guest_get_frame_pointer();
+	while (fp && entry->nr < entry->max_stack) {
+		if (!perf_guest_read_virt(&fp->next_frame, &frame.next_frame,
+			sizeof(frame.next_frame)))
+			break;
+		if (!perf_guest_read_virt(&fp->return_address, &frame.return_address,
+			sizeof(frame.return_address)))
+			break;
+		perf_callchain_store(entry, frame.return_address);
+		fp = (void *)frame.next_frame;
+	}
+}
+
+void
+perf_callchain_guest(struct perf_callchain_entry_ctx *entry)
+{
+	struct stack_frame frame;
+	const struct stack_frame *fp;
+	unsigned int guest_state;
+
+	guest_state = perf_guest_state();
+	perf_callchain_store(entry, perf_guest_get_ip());
+
+	if (guest_state & PERF_GUEST_64BIT) {
+		fp = (void *)perf_guest_get_frame_pointer();
+		while (fp && entry->nr < entry->max_stack) {
+			if (!perf_guest_read_virt(&fp->next_frame, &frame.next_frame,
+				sizeof(frame.next_frame)))
+				break;
+			if (!perf_guest_read_virt(&fp->return_address, &frame.return_address,
+				sizeof(frame.return_address)))
+				break;
+			perf_callchain_store(entry, frame.return_address);
+			fp = (void *)frame.next_frame;
+		}
+	} else {
+		perf_callchain_guest32(entry);
+	}
+}
+
 static inline int
 valid_user_frame(const void __user *fp, unsigned long size)
 {
@@ -2861,11 +2902,6 @@  perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 	struct stack_frame frame;
 	const struct stack_frame __user *fp;
 
-	if (perf_guest_state()) {
-		/* TODO: We don't support guest os callchain now */
-		return;
-	}
-
 	/*
 	 * We don't know what to do with VM86 stacks.. ignore them for now.
 	 */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d0f937a62..a2baf4856 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1545,9 +1545,10 @@  DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
 
 extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
+extern void perf_callchain_guest(struct perf_callchain_entry_ctx *entry);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-		   u32 max_stack, bool crosstask, bool add_mark);
+		   bool host, bool guest, u32 max_stack, bool crosstask, bool add_mark);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 458bb80b1..2e88d4639 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -294,8 +294,8 @@  BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	if (max_depth > sysctl_perf_event_max_stack)
 		max_depth = sysctl_perf_event_max_stack;
 
-	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
-				   false, false);
+	trace = get_perf_callchain(regs, 0, kernel, user, true, false,
+				   max_depth, false, false);
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
@@ -420,8 +420,8 @@  static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 	else if (kernel && task)
 		trace = get_callchain_entry_for_task(task, max_depth);
 	else
-		trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
-					   false, false);
+		trace = get_perf_callchain(regs, 0, kernel, user, true, false,
+					   max_depth, false, false);
 	if (unlikely(!trace))
 		goto err_fault;
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 1273be843..7e80729e9 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -45,6 +45,10 @@  __weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 {
 }
 
+__weak void perf_callchain_guest(struct perf_callchain_entry_ctx *entry)
+{
+}
+
 static void release_callchain_buffers_rcu(struct rcu_head *head)
 {
 	struct callchain_cpus_entries *entries;
@@ -178,11 +182,12 @@  put_callchain_entry(int rctx)
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-		   u32 max_stack, bool crosstask, bool add_mark)
+		   bool host, bool guest, u32 max_stack, bool crosstask, bool add_mark)
 {
 	struct perf_callchain_entry *entry;
 	struct perf_callchain_entry_ctx ctx;
 	int rctx;
+	unsigned int guest_state;
 
 	entry = get_callchain_entry(&rctx);
 	if (!entry)
@@ -194,6 +199,26 @@  get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 	ctx.contexts       = 0;
 	ctx.contexts_maxed = false;
 
+	guest_state = perf_guest_state();
+	if (guest_state) {
+		if (!guest)
+			goto exit_put;
+		if (user && (guest_state & PERF_GUEST_USER)) {
+			if (add_mark)
+				perf_callchain_store_context(&ctx, PERF_CONTEXT_GUEST_USER);
+			perf_callchain_guest(&ctx);
+		}
+		if (kernel && !(guest_state & PERF_GUEST_USER)) {
+			if (add_mark)
+				perf_callchain_store_context(&ctx, PERF_CONTEXT_GUEST_KERNEL);
+			perf_callchain_guest(&ctx);
+		}
+		goto exit_put;
+	}
+
+	if (unlikely(!host))
+		goto exit_put;
+
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
 			perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eaba00ec2..b3401f403 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7559,6 +7559,8 @@  perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
 	bool kernel = !event->attr.exclude_callchain_kernel;
 	bool user   = !event->attr.exclude_callchain_user;
+	bool host   = !event->attr.exclude_host;
+	bool guest  = !event->attr.exclude_guest;
 	/* Disallow cross-task user callchains. */
 	bool crosstask = event->ctx->task && event->ctx->task != current;
 	const u32 max_stack = event->attr.sample_max_stack;
@@ -7567,7 +7569,10 @@  perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	if (!kernel && !user)
 		return &__empty_callchain;
 
-	callchain = get_perf_callchain(regs, 0, kernel, user,
+	if (!host && !guest)
+		return &__empty_callchain;
+
+	callchain = get_perf_callchain(regs, 0, kernel, user, host, guest,
 				       max_stack, crosstask, true);
 	return callchain ?: &__empty_callchain;
 }