[v2] ipc/mqueue: fix potential sleeping issue in mqueue_flush_file

Message ID 20231220021208.2634523-1-shaozhengchao@huawei.com
State New
Headers
Series [v2] ipc/mqueue: fix potential sleeping issue in mqueue_flush_file |

Commit Message

shaozhengchao Dec. 20, 2023, 2:12 a.m. UTC
  I analyze the potential sleeping issue of the following processes:
Thread A                                Thread B
...                                     netlink_create  //ref = 1
do_mq_notify                            ...
  sock = netlink_getsockbyfilp          ...     //ref = 2
  info->notify_sock = sock;             ...
...                                     netlink_sendmsg
...                                       skb = netlink_alloc_large_skb  //skb->head is vmalloced
...                                       netlink_unicast
...                                         sk = netlink_getsockbyportid //ref = 3
...                                         netlink_sendskb
...                                           __netlink_sendskb
...                                             skb_queue_tail //put skb to sk_receive_queue
...                                         sock_put //ref = 2
...                                     ...
...                                     netlink_release
...                                       deferred_put_nlk_sk //ref = 1
mqueue_flush_file
  spin_lock
  remove_notification
    netlink_sendskb
      sock_put  //ref = 0
        sk_free
          ...
          __sk_destruct
            netlink_sock_destruct
              skb_queue_purge  //get skb from sk_receive_queue
                ...
                __skb_queue_purge_reason
                  kfree_skb_reason
                    __kfree_skb
                    ...
                    skb_release_all
                      skb_release_head_state
                        netlink_skb_destructor
                          vfree(skb->head)  //sleeping while holding spinlock

In netlink_sendmsg, if the memory pointed to by skb->head is allocated by
vmalloc, and is put to sk_receive_queue queue, also the skb is not freed.
When the mqueue executes flush, the sleeping bug will occur. Use mutex
lock instead of spin lock in mqueue_flush_file.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
v2: CCed some networking maintainer & netdev list
---
 ipc/mqueue.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)
  

Comments

shaozhengchao Jan. 2, 2024, 1:09 a.m. UTC | #1
+ping

Does anyone have ideas with this patch?

On 2023/12/20 10:12, Zhengchao Shao wrote:
> I analyze the potential sleeping issue of the following processes:
> Thread A                                Thread B
> ...                                     netlink_create  //ref = 1
> do_mq_notify                            ...
>    sock = netlink_getsockbyfilp          ...     //ref = 2
>    info->notify_sock = sock;             ...
> ...                                     netlink_sendmsg
> ...                                       skb = netlink_alloc_large_skb  //skb->head is vmalloced
> ...                                       netlink_unicast
> ...                                         sk = netlink_getsockbyportid //ref = 3
> ...                                         netlink_sendskb
> ...                                           __netlink_sendskb
> ...                                             skb_queue_tail //put skb to sk_receive_queue
> ...                                         sock_put //ref = 2
> ...                                     ...
> ...                                     netlink_release
> ...                                       deferred_put_nlk_sk //ref = 1
> mqueue_flush_file
>    spin_lock
>    remove_notification
>      netlink_sendskb
>        sock_put  //ref = 0
>          sk_free
>            ...
>            __sk_destruct
>              netlink_sock_destruct
>                skb_queue_purge  //get skb from sk_receive_queue
>                  ...
>                  __skb_queue_purge_reason
>                    kfree_skb_reason
>                      __kfree_skb
>                      ...
>                      skb_release_all
>                        skb_release_head_state
>                          netlink_skb_destructor
>                            vfree(skb->head)  //sleeping while holding spinlock
> 
> In netlink_sendmsg, if the memory pointed to by skb->head is allocated by
> vmalloc, and is put to sk_receive_queue queue, also the skb is not freed.
> When the mqueue executes flush, the sleeping bug will occur. Use mutex
> lock instead of spin lock in mqueue_flush_file.
> 
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
> ---
> v2: CCed some networking maintainer & netdev list
> ---
>   ipc/mqueue.c | 48 ++++++++++++++++++++++++------------------------
>   1 file changed, 24 insertions(+), 24 deletions(-)
> 
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index 5eea4dc0509e..f6f92e3f82e4 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -118,9 +118,9 @@ struct posix_msg_tree_node {
>    * Solution: use _release and _acquire barriers.
>    *
>    * 3) There is intentionally no barrier when setting current->state
> - *    to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
> + *    to TASK_INTERRUPTIBLE: mutex_unlock(&info->lock) provides the
>    *    release memory barrier, and the wakeup is triggered when holding
> - *    info->lock, i.e. spin_lock(&info->lock) provided a pairing
> + *    info->lock, i.e. mutex_lock(&info->lock) provided a pairing
>    *    acquire memory barrier.
>    */
>   
> @@ -132,7 +132,7 @@ struct ext_wait_queue {		/* queue of sleeping tasks */
>   };
>   
>   struct mqueue_inode_info {
> -	spinlock_t lock;
> +	struct mutex lock;
>   	struct inode vfs_inode;
>   	wait_queue_head_t wait_q;
>   
> @@ -312,7 +312,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
>   		inode->i_size = FILENT_SIZE;
>   		/* mqueue specific info */
>   		info = MQUEUE_I(inode);
> -		spin_lock_init(&info->lock);
> +		mutex_init(&info->lock);
>   		init_waitqueue_head(&info->wait_q);
>   		INIT_LIST_HEAD(&info->e_wait_q[0].list);
>   		INIT_LIST_HEAD(&info->e_wait_q[1].list);
> @@ -523,11 +523,11 @@ static void mqueue_evict_inode(struct inode *inode)
>   
>   	ipc_ns = get_ns_from_inode(inode);
>   	info = MQUEUE_I(inode);
> -	spin_lock(&info->lock);
> +	mutex_lock(&info->lock);
>   	while ((msg = msg_get(info)) != NULL)
>   		list_add_tail(&msg->m_list, &tmp_msg);
>   	kfree(info->node_cache);
> -	spin_unlock(&info->lock);
> +	mutex_unlock(&info->lock);
>   
>   	list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
>   		list_del(&msg->m_list);
> @@ -640,7 +640,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
>   	char buffer[FILENT_SIZE];
>   	ssize_t ret;
>   
> -	spin_lock(&info->lock);
> +	mutex_lock(&info->lock);
>   	snprintf(buffer, sizeof(buffer),
>   			"QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
>   			info->qsize,
> @@ -649,7 +649,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
>   			 info->notify.sigev_notify == SIGEV_SIGNAL) ?
>   				info->notify.sigev_signo : 0,
>   			pid_vnr(info->notify_owner));
> -	spin_unlock(&info->lock);
> +	mutex_unlock(&info->lock);
>   	buffer[sizeof(buffer)-1] = '\0';
>   
>   	ret = simple_read_from_buffer(u_data, count, off, buffer,
> @@ -665,11 +665,11 @@ static int mqueue_flush_file(struct file *filp, fl_owner_t id)
>   {
>   	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
>   
> -	spin_lock(&info->lock);
> +	mutex_lock(&info->lock);
>   	if (task_tgid(current) == info->notify_owner)
>   		remove_notification(info);
>   
> -	spin_unlock(&info->lock);
> +	mutex_unlock(&info->lock);
>   	return 0;
>   }
>   
> @@ -680,13 +680,13 @@ static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *po
>   
>   	poll_wait(filp, &info->wait_q, poll_tab);
>   
> -	spin_lock(&info->lock);
> +	mutex_lock(&info->lock);
>   	if (info->attr.mq_curmsgs)
>   		retval = EPOLLIN | EPOLLRDNORM;
>   
>   	if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
>   		retval |= EPOLLOUT | EPOLLWRNORM;
> -	spin_unlock(&info->lock);
> +	mutex_unlock(&info->lock);
>   
>   	return retval;
>   }
> @@ -724,7 +724,7 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
>   		/* memory barrier not required, we hold info->lock */
>   		__set_current_state(TASK_INTERRUPTIBLE);
>   
> -		spin_unlock(&info->lock);
> +		mutex_unlock(&info->lock);
>   		time = schedule_hrtimeout_range_clock(timeout, 0,
>   			HRTIMER_MODE_ABS, CLOCK_REALTIME);
>   
> @@ -734,7 +734,7 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
>   			retval = 0;
>   			goto out;
>   		}
> -		spin_lock(&info->lock);
> +		mutex_lock(&info->lock);
>   
>   		/* we hold info->lock, so no memory barrier required */
>   		if (READ_ONCE(ewp->state) == STATE_READY) {
> @@ -752,7 +752,7 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
>   	}
>   	list_del(&ewp->list);
>   out_unlock:
> -	spin_unlock(&info->lock);
> +	mutex_unlock(&info->lock);
>   out:
>   	return retval;
>   }
> @@ -1125,7 +1125,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
>   	if (!info->node_cache)
>   		new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
>   
> -	spin_lock(&info->lock);
> +	mutex_lock(&info->lock);
>   
>   	if (!info->node_cache && new_leaf) {
>   		/* Save our speculative allocation into the cache */
> @@ -1166,7 +1166,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
>   		simple_inode_init_ts(inode);
>   	}
>   out_unlock:
> -	spin_unlock(&info->lock);
> +	mutex_unlock(&info->lock);
>   	wake_up_q(&wake_q);
>   out_free:
>   	if (ret)
> @@ -1230,7 +1230,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
>   	if (!info->node_cache)
>   		new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
>   
> -	spin_lock(&info->lock);
> +	mutex_lock(&info->lock);
>   
>   	if (!info->node_cache && new_leaf) {
>   		/* Save our speculative allocation into the cache */
> @@ -1242,7 +1242,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
>   
>   	if (info->attr.mq_curmsgs == 0) {
>   		if (f.file->f_flags & O_NONBLOCK) {
> -			spin_unlock(&info->lock);
> +			mutex_unlock(&info->lock);
>   			ret = -EAGAIN;
>   		} else {
>   			wait.task = current;
> @@ -1261,7 +1261,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
>   
>   		/* There is now free space in queue. */
>   		pipelined_receive(&wake_q, info);
> -		spin_unlock(&info->lock);
> +		mutex_unlock(&info->lock);
>   		wake_up_q(&wake_q);
>   		ret = 0;
>   	}
> @@ -1391,7 +1391,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
>   	info = MQUEUE_I(inode);
>   
>   	ret = 0;
> -	spin_lock(&info->lock);
> +	mutex_lock(&info->lock);
>   	if (notification == NULL) {
>   		if (info->notify_owner == task_tgid(current)) {
>   			remove_notification(info);
> @@ -1424,7 +1424,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
>   		info->notify_user_ns = get_user_ns(current_user_ns());
>   		inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
>   	}
> -	spin_unlock(&info->lock);
> +	mutex_unlock(&info->lock);
>   out_fput:
>   	fdput(f);
>   out:
> @@ -1470,7 +1470,7 @@ static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
>   	inode = file_inode(f.file);
>   	info = MQUEUE_I(inode);
>   
> -	spin_lock(&info->lock);
> +	mutex_lock(&info->lock);
>   
>   	if (old) {
>   		*old = info->attr;
> @@ -1488,7 +1488,7 @@ static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
>   		inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
>   	}
>   
> -	spin_unlock(&info->lock);
> +	mutex_unlock(&info->lock);
>   	fdput(f);
>   	return 0;
>   }
  
shaozhengchao Jan. 18, 2024, 1:20 a.m. UTC | #2
++ping

Does anyone have ideas with this patch?

On 2024/1/2 9:09, shaozhengchao wrote:
> +ping
> 
> Does anyone have ideas with this patch?
> 
> On 2023/12/20 10:12, Zhengchao Shao wrote:
>> I analyze the potential sleeping issue of the following processes:
>> Thread A                                Thread B
>> ...                                     netlink_create  //ref = 1
>> do_mq_notify                            ...
>>    sock = netlink_getsockbyfilp          ...     //ref = 2
>>    info->notify_sock = sock;             ...
>> ...                                     netlink_sendmsg
>> ...                                       skb = 
>> netlink_alloc_large_skb  //skb->head is vmalloced
>> ...                                       netlink_unicast
>> ...                                         sk = 
>> netlink_getsockbyportid //ref = 3
>> ...                                         netlink_sendskb
>> ...                                           __netlink_sendskb
>> ...                                             skb_queue_tail //put 
>> skb to sk_receive_queue
>> ...                                         sock_put //ref = 2
>> ...                                     ...
>> ...                                     netlink_release
>> ...                                       deferred_put_nlk_sk //ref = 1
>> mqueue_flush_file
>>    spin_lock
>>    remove_notification
>>      netlink_sendskb
>>        sock_put  //ref = 0
>>          sk_free
>>            ...
>>            __sk_destruct
>>              netlink_sock_destruct
>>                skb_queue_purge  //get skb from sk_receive_queue
>>                  ...
>>                  __skb_queue_purge_reason
>>                    kfree_skb_reason
>>                      __kfree_skb
>>                      ...
>>                      skb_release_all
>>                        skb_release_head_state
>>                          netlink_skb_destructor
>>                            vfree(skb->head)  //sleeping while holding 
>> spinlock
>>
>> In netlink_sendmsg, if the memory pointed to by skb->head is allocated by
>> vmalloc, and is put to sk_receive_queue queue, also the skb is not freed.
>> When the mqueue executes flush, the sleeping bug will occur. Use mutex
>> lock instead of spin lock in mqueue_flush_file.
>>
>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>> Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
>> ---
>> v2: CCed some networking maintainer & netdev list
>> ---
>>   ipc/mqueue.c | 48 ++++++++++++++++++++++++------------------------
>>   1 file changed, 24 insertions(+), 24 deletions(-)
>>
>> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
>> index 5eea4dc0509e..f6f92e3f82e4 100644
>> --- a/ipc/mqueue.c
>> +++ b/ipc/mqueue.c
>> @@ -118,9 +118,9 @@ struct posix_msg_tree_node {
>>    * Solution: use _release and _acquire barriers.
>>    *
>>    * 3) There is intentionally no barrier when setting current->state
>> - *    to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
>> + *    to TASK_INTERRUPTIBLE: mutex_unlock(&info->lock) provides the
>>    *    release memory barrier, and the wakeup is triggered when holding
>> - *    info->lock, i.e. spin_lock(&info->lock) provided a pairing
>> + *    info->lock, i.e. mutex_lock(&info->lock) provided a pairing
>>    *    acquire memory barrier.
>>    */
>> @@ -132,7 +132,7 @@ struct ext_wait_queue {        /* queue of 
>> sleeping tasks */
>>   };
>>   struct mqueue_inode_info {
>> -    spinlock_t lock;
>> +    struct mutex lock;
>>       struct inode vfs_inode;
>>       wait_queue_head_t wait_q;
>> @@ -312,7 +312,7 @@ static struct inode *mqueue_get_inode(struct 
>> super_block *sb,
>>           inode->i_size = FILENT_SIZE;
>>           /* mqueue specific info */
>>           info = MQUEUE_I(inode);
>> -        spin_lock_init(&info->lock);
>> +        mutex_init(&info->lock);
>>           init_waitqueue_head(&info->wait_q);
>>           INIT_LIST_HEAD(&info->e_wait_q[0].list);
>>           INIT_LIST_HEAD(&info->e_wait_q[1].list);
>> @@ -523,11 +523,11 @@ static void mqueue_evict_inode(struct inode *inode)
>>       ipc_ns = get_ns_from_inode(inode);
>>       info = MQUEUE_I(inode);
>> -    spin_lock(&info->lock);
>> +    mutex_lock(&info->lock);
>>       while ((msg = msg_get(info)) != NULL)
>>           list_add_tail(&msg->m_list, &tmp_msg);
>>       kfree(info->node_cache);
>> -    spin_unlock(&info->lock);
>> +    mutex_unlock(&info->lock);
>>       list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
>>           list_del(&msg->m_list);
>> @@ -640,7 +640,7 @@ static ssize_t mqueue_read_file(struct file *filp, 
>> char __user *u_data,
>>       char buffer[FILENT_SIZE];
>>       ssize_t ret;
>> -    spin_lock(&info->lock);
>> +    mutex_lock(&info->lock);
>>       snprintf(buffer, sizeof(buffer),
>>               "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
>>               info->qsize,
>> @@ -649,7 +649,7 @@ static ssize_t mqueue_read_file(struct file *filp, 
>> char __user *u_data,
>>                info->notify.sigev_notify == SIGEV_SIGNAL) ?
>>                   info->notify.sigev_signo : 0,
>>               pid_vnr(info->notify_owner));
>> -    spin_unlock(&info->lock);
>> +    mutex_unlock(&info->lock);
>>       buffer[sizeof(buffer)-1] = '\0';
>>       ret = simple_read_from_buffer(u_data, count, off, buffer,
>> @@ -665,11 +665,11 @@ static int mqueue_flush_file(struct file *filp, 
>> fl_owner_t id)
>>   {
>>       struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
>> -    spin_lock(&info->lock);
>> +    mutex_lock(&info->lock);
>>       if (task_tgid(current) == info->notify_owner)
>>           remove_notification(info);
>> -    spin_unlock(&info->lock);
>> +    mutex_unlock(&info->lock);
>>       return 0;
>>   }
>> @@ -680,13 +680,13 @@ static __poll_t mqueue_poll_file(struct file 
>> *filp, struct poll_table_struct *po
>>       poll_wait(filp, &info->wait_q, poll_tab);
>> -    spin_lock(&info->lock);
>> +    mutex_lock(&info->lock);
>>       if (info->attr.mq_curmsgs)
>>           retval = EPOLLIN | EPOLLRDNORM;
>>       if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
>>           retval |= EPOLLOUT | EPOLLWRNORM;
>> -    spin_unlock(&info->lock);
>> +    mutex_unlock(&info->lock);
>>       return retval;
>>   }
>> @@ -724,7 +724,7 @@ static int wq_sleep(struct mqueue_inode_info 
>> *info, int sr,
>>           /* memory barrier not required, we hold info->lock */
>>           __set_current_state(TASK_INTERRUPTIBLE);
>> -        spin_unlock(&info->lock);
>> +        mutex_unlock(&info->lock);
>>           time = schedule_hrtimeout_range_clock(timeout, 0,
>>               HRTIMER_MODE_ABS, CLOCK_REALTIME);
>> @@ -734,7 +734,7 @@ static int wq_sleep(struct mqueue_inode_info 
>> *info, int sr,
>>               retval = 0;
>>               goto out;
>>           }
>> -        spin_lock(&info->lock);
>> +        mutex_lock(&info->lock);
>>           /* we hold info->lock, so no memory barrier required */
>>           if (READ_ONCE(ewp->state) == STATE_READY) {
>> @@ -752,7 +752,7 @@ static int wq_sleep(struct mqueue_inode_info 
>> *info, int sr,
>>       }
>>       list_del(&ewp->list);
>>   out_unlock:
>> -    spin_unlock(&info->lock);
>> +    mutex_unlock(&info->lock);
>>   out:
>>       return retval;
>>   }
>> @@ -1125,7 +1125,7 @@ static int do_mq_timedsend(mqd_t mqdes, const 
>> char __user *u_msg_ptr,
>>       if (!info->node_cache)
>>           new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
>> -    spin_lock(&info->lock);
>> +    mutex_lock(&info->lock);
>>       if (!info->node_cache && new_leaf) {
>>           /* Save our speculative allocation into the cache */
>> @@ -1166,7 +1166,7 @@ static int do_mq_timedsend(mqd_t mqdes, const 
>> char __user *u_msg_ptr,
>>           simple_inode_init_ts(inode);
>>       }
>>   out_unlock:
>> -    spin_unlock(&info->lock);
>> +    mutex_unlock(&info->lock);
>>       wake_up_q(&wake_q);
>>   out_free:
>>       if (ret)
>> @@ -1230,7 +1230,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char 
>> __user *u_msg_ptr,
>>       if (!info->node_cache)
>>           new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
>> -    spin_lock(&info->lock);
>> +    mutex_lock(&info->lock);
>>       if (!info->node_cache && new_leaf) {
>>           /* Save our speculative allocation into the cache */
>> @@ -1242,7 +1242,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char 
>> __user *u_msg_ptr,
>>       if (info->attr.mq_curmsgs == 0) {
>>           if (f.file->f_flags & O_NONBLOCK) {
>> -            spin_unlock(&info->lock);
>> +            mutex_unlock(&info->lock);
>>               ret = -EAGAIN;
>>           } else {
>>               wait.task = current;
>> @@ -1261,7 +1261,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char 
>> __user *u_msg_ptr,
>>           /* There is now free space in queue. */
>>           pipelined_receive(&wake_q, info);
>> -        spin_unlock(&info->lock);
>> +        mutex_unlock(&info->lock);
>>           wake_up_q(&wake_q);
>>           ret = 0;
>>       }
>> @@ -1391,7 +1391,7 @@ static int do_mq_notify(mqd_t mqdes, const 
>> struct sigevent *notification)
>>       info = MQUEUE_I(inode);
>>       ret = 0;
>> -    spin_lock(&info->lock);
>> +    mutex_lock(&info->lock);
>>       if (notification == NULL) {
>>           if (info->notify_owner == task_tgid(current)) {
>>               remove_notification(info);
>> @@ -1424,7 +1424,7 @@ static int do_mq_notify(mqd_t mqdes, const 
>> struct sigevent *notification)
>>           info->notify_user_ns = get_user_ns(current_user_ns());
>>           inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
>>       }
>> -    spin_unlock(&info->lock);
>> +    mutex_unlock(&info->lock);
>>   out_fput:
>>       fdput(f);
>>   out:
>> @@ -1470,7 +1470,7 @@ static int do_mq_getsetattr(int mqdes, struct 
>> mq_attr *new, struct mq_attr *old)
>>       inode = file_inode(f.file);
>>       info = MQUEUE_I(inode);
>> -    spin_lock(&info->lock);
>> +    mutex_lock(&info->lock);
>>       if (old) {
>>           *old = info->attr;
>> @@ -1488,7 +1488,7 @@ static int do_mq_getsetattr(int mqdes, struct 
>> mq_attr *new, struct mq_attr *old)
>>           inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
>>       }
>> -    spin_unlock(&info->lock);
>> +    mutex_unlock(&info->lock);
>>       fdput(f);
>>       return 0;
>>   }
>
  
Hillf Danton Jan. 18, 2024, 11:46 a.m. UTC | #3
On 2023/12/20 10:12, Zhengchao Shao wrote:
> I analyze the potential sleeping issue of the following processes:
> Thread A                                Thread B
> ...                                     netlink_create  //ref = 1
> do_mq_notify                            ...
>    sock = netlink_getsockbyfilp          ...     //ref = 2
>    info->notify_sock = sock;             ...
> ...                                     netlink_sendmsg
> ...                                       skb = netlink_alloc_large_skb  //skb->head is vmalloced
> ...                                       netlink_unicast
> ...                                         sk = netlink_getsockbyportid //ref = 3
> ...                                         netlink_sendskb
> ...                                           __netlink_sendskb
> ...                                             skb_queue_tail //put skb to sk_receive_queue
> ...                                         sock_put //ref = 2
> ...                                     ...
> ...                                     netlink_release
> ...                                       deferred_put_nlk_sk //ref = 1
> mqueue_flush_file
>    spin_lock
>    remove_notification
>      netlink_sendskb
>        sock_put  //ref = 0
>          sk_free
>            ...
>            __sk_destruct
>              netlink_sock_destruct
>                skb_queue_purge  //get skb from sk_receive_queue
>                  ...
>                  __skb_queue_purge_reason
>                    kfree_skb_reason
>                      __kfree_skb
>                      ...
>                      skb_release_all
>                        skb_release_head_state
>                          netlink_skb_destructor
>                            vfree(skb->head)  //sleeping while holding spinlock
> 
> In netlink_sendmsg, if the memory pointed to by skb->head is allocated by
> vmalloc, and is put to sk_receive_queue queue, also the skb is not freed.
> When the mqueue executes flush, the sleeping bug will occur. Use mutex
> lock instead of spin lock in mqueue_flush_file.

It makes no sense to replace spinlock with mutex just for putting sock.

Only for thoughts.

--- x/ipc/mqueue.c
+++ y/ipc/mqueue.c
@@ -663,12 +663,17 @@ static ssize_t mqueue_read_file(struct f
 static int mqueue_flush_file(struct file *filp, fl_owner_t id)
 {
 	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
+	struct sock *sk = NULL;
 
 	spin_lock(&info->lock);
-	if (task_tgid(current) == info->notify_owner)
+	if (task_tgid(current) == info->notify_owner) {
+		sk = info->notify_sock;
+		sock_hold(sk);
 		remove_notification(info);
-
+	}
 	spin_unlock(&info->lock);
+	if (sk)
+		sock_put(sk);
 	return 0;
 }
  
shaozhengchao Jan. 19, 2024, 1:18 a.m. UTC | #4
On 2024/1/18 19:46, Hillf Danton wrote:
> On 2023/12/20 10:12, Zhengchao Shao wrote:
>> I analyze the potential sleeping issue of the following processes:
>> Thread A                                Thread B
>> ...                                     netlink_create  //ref = 1
>> do_mq_notify                            ...
>>     sock = netlink_getsockbyfilp          ...     //ref = 2
>>     info->notify_sock = sock;             ...
>> ...                                     netlink_sendmsg
>> ...                                       skb = netlink_alloc_large_skb  //skb->head is vmalloced
>> ...                                       netlink_unicast
>> ...                                         sk = netlink_getsockbyportid //ref = 3
>> ...                                         netlink_sendskb
>> ...                                           __netlink_sendskb
>> ...                                             skb_queue_tail //put skb to sk_receive_queue
>> ...                                         sock_put //ref = 2
>> ...                                     ...
>> ...                                     netlink_release
>> ...                                       deferred_put_nlk_sk //ref = 1
>> mqueue_flush_file
>>     spin_lock
>>     remove_notification
>>       netlink_sendskb
>>         sock_put  //ref = 0
>>           sk_free
>>             ...
>>             __sk_destruct
>>               netlink_sock_destruct
>>                 skb_queue_purge  //get skb from sk_receive_queue
>>                   ...
>>                   __skb_queue_purge_reason
>>                     kfree_skb_reason
>>                       __kfree_skb
>>                       ...
>>                       skb_release_all
>>                         skb_release_head_state
>>                           netlink_skb_destructor
>>                             vfree(skb->head)  //sleeping while holding spinlock
>>
>> In netlink_sendmsg, if the memory pointed to by skb->head is allocated by
>> vmalloc, and is put to sk_receive_queue queue, also the skb is not freed.
>> When the mqueue executes flush, the sleeping bug will occur. Use mutex
>> lock instead of spin lock in mqueue_flush_file.
> 
> It makes no sense to replace spinlock with mutex just for putting sock.
> 
> Only for thoughts.
> 
> --- x/ipc/mqueue.c
> +++ y/ipc/mqueue.c
> @@ -663,12 +663,17 @@ static ssize_t mqueue_read_file(struct f
>   static int mqueue_flush_file(struct file *filp, fl_owner_t id)
>   {
>   	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
> +	struct sock *sk = NULL;
>   
>   	spin_lock(&info->lock);
> -	if (task_tgid(current) == info->notify_owner)
> +	if (task_tgid(current) == info->notify_owner) {
> +		sk = info->notify_sock;
> +		sock_hold(sk);
>   		remove_notification(info);
> -
> +	}
>   	spin_unlock(&info->lock);
> +	if (sk)
> +		sock_put(sk);
>   	return 0;
>   }
>   

Hi Hillf:
	Thank you for your review. Your changes are more concise and
clear. I will send V3.

Zhengchao Shao
  

Patch

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 5eea4dc0509e..f6f92e3f82e4 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -118,9 +118,9 @@  struct posix_msg_tree_node {
  * Solution: use _release and _acquire barriers.
  *
  * 3) There is intentionally no barrier when setting current->state
- *    to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
+ *    to TASK_INTERRUPTIBLE: mutex_unlock(&info->lock) provides the
  *    release memory barrier, and the wakeup is triggered when holding
- *    info->lock, i.e. spin_lock(&info->lock) provided a pairing
+ *    info->lock, i.e. mutex_lock(&info->lock) provided a pairing
  *    acquire memory barrier.
  */
 
@@ -132,7 +132,7 @@  struct ext_wait_queue {		/* queue of sleeping tasks */
 };
 
 struct mqueue_inode_info {
-	spinlock_t lock;
+	struct mutex lock;
 	struct inode vfs_inode;
 	wait_queue_head_t wait_q;
 
@@ -312,7 +312,7 @@  static struct inode *mqueue_get_inode(struct super_block *sb,
 		inode->i_size = FILENT_SIZE;
 		/* mqueue specific info */
 		info = MQUEUE_I(inode);
-		spin_lock_init(&info->lock);
+		mutex_init(&info->lock);
 		init_waitqueue_head(&info->wait_q);
 		INIT_LIST_HEAD(&info->e_wait_q[0].list);
 		INIT_LIST_HEAD(&info->e_wait_q[1].list);
@@ -523,11 +523,11 @@  static void mqueue_evict_inode(struct inode *inode)
 
 	ipc_ns = get_ns_from_inode(inode);
 	info = MQUEUE_I(inode);
-	spin_lock(&info->lock);
+	mutex_lock(&info->lock);
 	while ((msg = msg_get(info)) != NULL)
 		list_add_tail(&msg->m_list, &tmp_msg);
 	kfree(info->node_cache);
-	spin_unlock(&info->lock);
+	mutex_unlock(&info->lock);
 
 	list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
 		list_del(&msg->m_list);
@@ -640,7 +640,7 @@  static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
 	char buffer[FILENT_SIZE];
 	ssize_t ret;
 
-	spin_lock(&info->lock);
+	mutex_lock(&info->lock);
 	snprintf(buffer, sizeof(buffer),
 			"QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
 			info->qsize,
@@ -649,7 +649,7 @@  static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
 			 info->notify.sigev_notify == SIGEV_SIGNAL) ?
 				info->notify.sigev_signo : 0,
 			pid_vnr(info->notify_owner));
-	spin_unlock(&info->lock);
+	mutex_unlock(&info->lock);
 	buffer[sizeof(buffer)-1] = '\0';
 
 	ret = simple_read_from_buffer(u_data, count, off, buffer,
@@ -665,11 +665,11 @@  static int mqueue_flush_file(struct file *filp, fl_owner_t id)
 {
 	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
 
-	spin_lock(&info->lock);
+	mutex_lock(&info->lock);
 	if (task_tgid(current) == info->notify_owner)
 		remove_notification(info);
 
-	spin_unlock(&info->lock);
+	mutex_unlock(&info->lock);
 	return 0;
 }
 
@@ -680,13 +680,13 @@  static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *po
 
 	poll_wait(filp, &info->wait_q, poll_tab);
 
-	spin_lock(&info->lock);
+	mutex_lock(&info->lock);
 	if (info->attr.mq_curmsgs)
 		retval = EPOLLIN | EPOLLRDNORM;
 
 	if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
 		retval |= EPOLLOUT | EPOLLWRNORM;
-	spin_unlock(&info->lock);
+	mutex_unlock(&info->lock);
 
 	return retval;
 }
@@ -724,7 +724,7 @@  static int wq_sleep(struct mqueue_inode_info *info, int sr,
 		/* memory barrier not required, we hold info->lock */
 		__set_current_state(TASK_INTERRUPTIBLE);
 
-		spin_unlock(&info->lock);
+		mutex_unlock(&info->lock);
 		time = schedule_hrtimeout_range_clock(timeout, 0,
 			HRTIMER_MODE_ABS, CLOCK_REALTIME);
 
@@ -734,7 +734,7 @@  static int wq_sleep(struct mqueue_inode_info *info, int sr,
 			retval = 0;
 			goto out;
 		}
-		spin_lock(&info->lock);
+		mutex_lock(&info->lock);
 
 		/* we hold info->lock, so no memory barrier required */
 		if (READ_ONCE(ewp->state) == STATE_READY) {
@@ -752,7 +752,7 @@  static int wq_sleep(struct mqueue_inode_info *info, int sr,
 	}
 	list_del(&ewp->list);
 out_unlock:
-	spin_unlock(&info->lock);
+	mutex_unlock(&info->lock);
 out:
 	return retval;
 }
@@ -1125,7 +1125,7 @@  static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
 	if (!info->node_cache)
 		new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
 
-	spin_lock(&info->lock);
+	mutex_lock(&info->lock);
 
 	if (!info->node_cache && new_leaf) {
 		/* Save our speculative allocation into the cache */
@@ -1166,7 +1166,7 @@  static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
 		simple_inode_init_ts(inode);
 	}
 out_unlock:
-	spin_unlock(&info->lock);
+	mutex_unlock(&info->lock);
 	wake_up_q(&wake_q);
 out_free:
 	if (ret)
@@ -1230,7 +1230,7 @@  static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
 	if (!info->node_cache)
 		new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
 
-	spin_lock(&info->lock);
+	mutex_lock(&info->lock);
 
 	if (!info->node_cache && new_leaf) {
 		/* Save our speculative allocation into the cache */
@@ -1242,7 +1242,7 @@  static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
 
 	if (info->attr.mq_curmsgs == 0) {
 		if (f.file->f_flags & O_NONBLOCK) {
-			spin_unlock(&info->lock);
+			mutex_unlock(&info->lock);
 			ret = -EAGAIN;
 		} else {
 			wait.task = current;
@@ -1261,7 +1261,7 @@  static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
 
 		/* There is now free space in queue. */
 		pipelined_receive(&wake_q, info);
-		spin_unlock(&info->lock);
+		mutex_unlock(&info->lock);
 		wake_up_q(&wake_q);
 		ret = 0;
 	}
@@ -1391,7 +1391,7 @@  static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
 	info = MQUEUE_I(inode);
 
 	ret = 0;
-	spin_lock(&info->lock);
+	mutex_lock(&info->lock);
 	if (notification == NULL) {
 		if (info->notify_owner == task_tgid(current)) {
 			remove_notification(info);
@@ -1424,7 +1424,7 @@  static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
 		info->notify_user_ns = get_user_ns(current_user_ns());
 		inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
 	}
-	spin_unlock(&info->lock);
+	mutex_unlock(&info->lock);
 out_fput:
 	fdput(f);
 out:
@@ -1470,7 +1470,7 @@  static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
 	inode = file_inode(f.file);
 	info = MQUEUE_I(inode);
 
-	spin_lock(&info->lock);
+	mutex_lock(&info->lock);
 
 	if (old) {
 		*old = info->attr;
@@ -1488,7 +1488,7 @@  static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
 		inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
 	}
 
-	spin_unlock(&info->lock);
+	mutex_unlock(&info->lock);
 	fdput(f);
 	return 0;
 }