[v2] ipc/mqueue: fix potential sleeping issue in mqueue_flush_file
Commit Message
I analyze the potential sleeping issue of the following processes:
Thread A Thread B
... netlink_create //ref = 1
do_mq_notify ...
sock = netlink_getsockbyfilp ... //ref = 2
info->notify_sock = sock; ...
... netlink_sendmsg
... skb = netlink_alloc_large_skb //skb->head is vmalloced
... netlink_unicast
... sk = netlink_getsockbyportid //ref = 3
... netlink_sendskb
... __netlink_sendskb
... skb_queue_tail //put skb to sk_receive_queue
... sock_put //ref = 2
... ...
... netlink_release
... deferred_put_nlk_sk //ref = 1
mqueue_flush_file
spin_lock
remove_notification
netlink_sendskb
sock_put //ref = 0
sk_free
...
__sk_destruct
netlink_sock_destruct
skb_queue_purge //get skb from sk_receive_queue
...
__skb_queue_purge_reason
kfree_skb_reason
__kfree_skb
...
skb_release_all
skb_release_head_state
netlink_skb_destructor
vfree(skb->head) //sleeping while holding spinlock
In netlink_sendmsg, if the memory pointed to by skb->head is allocated by
vmalloc, and is put to sk_receive_queue queue, also the skb is not freed.
When the mqueue executes flush, the sleeping bug will occur. Use mutex
lock instead of spin lock in mqueue_flush_file.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
v2: CCed some networking maintainer & netdev list
---
ipc/mqueue.c | 48 ++++++++++++++++++++++++------------------------
1 file changed, 24 insertions(+), 24 deletions(-)
Comments
+ping
Does anyone have ideas with this patch?
On 2023/12/20 10:12, Zhengchao Shao wrote:
> I analyze the potential sleeping issue of the following processes:
> Thread A Thread B
> ... netlink_create //ref = 1
> do_mq_notify ...
> sock = netlink_getsockbyfilp ... //ref = 2
> info->notify_sock = sock; ...
> ... netlink_sendmsg
> ... skb = netlink_alloc_large_skb //skb->head is vmalloced
> ... netlink_unicast
> ... sk = netlink_getsockbyportid //ref = 3
> ... netlink_sendskb
> ... __netlink_sendskb
> ... skb_queue_tail //put skb to sk_receive_queue
> ... sock_put //ref = 2
> ... ...
> ... netlink_release
> ... deferred_put_nlk_sk //ref = 1
> mqueue_flush_file
> spin_lock
> remove_notification
> netlink_sendskb
> sock_put //ref = 0
> sk_free
> ...
> __sk_destruct
> netlink_sock_destruct
> skb_queue_purge //get skb from sk_receive_queue
> ...
> __skb_queue_purge_reason
> kfree_skb_reason
> __kfree_skb
> ...
> skb_release_all
> skb_release_head_state
> netlink_skb_destructor
> vfree(skb->head) //sleeping while holding spinlock
>
> In netlink_sendmsg, if the memory pointed to by skb->head is allocated by
> vmalloc, and is put to sk_receive_queue queue, also the skb is not freed.
> When the mqueue executes flush, the sleeping bug will occur. Use mutex
> lock instead of spin lock in mqueue_flush_file.
>
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
> ---
> v2: CCed some networking maintainer & netdev list
> ---
> ipc/mqueue.c | 48 ++++++++++++++++++++++++------------------------
> 1 file changed, 24 insertions(+), 24 deletions(-)
>
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index 5eea4dc0509e..f6f92e3f82e4 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -118,9 +118,9 @@ struct posix_msg_tree_node {
> * Solution: use _release and _acquire barriers.
> *
> * 3) There is intentionally no barrier when setting current->state
> - * to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
> + * to TASK_INTERRUPTIBLE: mutex_unlock(&info->lock) provides the
> * release memory barrier, and the wakeup is triggered when holding
> - * info->lock, i.e. spin_lock(&info->lock) provided a pairing
> + * info->lock, i.e. mutex_lock(&info->lock) provided a pairing
> * acquire memory barrier.
> */
>
> @@ -132,7 +132,7 @@ struct ext_wait_queue { /* queue of sleeping tasks */
> };
>
> struct mqueue_inode_info {
> - spinlock_t lock;
> + struct mutex lock;
> struct inode vfs_inode;
> wait_queue_head_t wait_q;
>
> @@ -312,7 +312,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
> inode->i_size = FILENT_SIZE;
> /* mqueue specific info */
> info = MQUEUE_I(inode);
> - spin_lock_init(&info->lock);
> + mutex_init(&info->lock);
> init_waitqueue_head(&info->wait_q);
> INIT_LIST_HEAD(&info->e_wait_q[0].list);
> INIT_LIST_HEAD(&info->e_wait_q[1].list);
> @@ -523,11 +523,11 @@ static void mqueue_evict_inode(struct inode *inode)
>
> ipc_ns = get_ns_from_inode(inode);
> info = MQUEUE_I(inode);
> - spin_lock(&info->lock);
> + mutex_lock(&info->lock);
> while ((msg = msg_get(info)) != NULL)
> list_add_tail(&msg->m_list, &tmp_msg);
> kfree(info->node_cache);
> - spin_unlock(&info->lock);
> + mutex_unlock(&info->lock);
>
> list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
> list_del(&msg->m_list);
> @@ -640,7 +640,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
> char buffer[FILENT_SIZE];
> ssize_t ret;
>
> - spin_lock(&info->lock);
> + mutex_lock(&info->lock);
> snprintf(buffer, sizeof(buffer),
> "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
> info->qsize,
> @@ -649,7 +649,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
> info->notify.sigev_notify == SIGEV_SIGNAL) ?
> info->notify.sigev_signo : 0,
> pid_vnr(info->notify_owner));
> - spin_unlock(&info->lock);
> + mutex_unlock(&info->lock);
> buffer[sizeof(buffer)-1] = '\0';
>
> ret = simple_read_from_buffer(u_data, count, off, buffer,
> @@ -665,11 +665,11 @@ static int mqueue_flush_file(struct file *filp, fl_owner_t id)
> {
> struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
>
> - spin_lock(&info->lock);
> + mutex_lock(&info->lock);
> if (task_tgid(current) == info->notify_owner)
> remove_notification(info);
>
> - spin_unlock(&info->lock);
> + mutex_unlock(&info->lock);
> return 0;
> }
>
> @@ -680,13 +680,13 @@ static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *po
>
> poll_wait(filp, &info->wait_q, poll_tab);
>
> - spin_lock(&info->lock);
> + mutex_lock(&info->lock);
> if (info->attr.mq_curmsgs)
> retval = EPOLLIN | EPOLLRDNORM;
>
> if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
> retval |= EPOLLOUT | EPOLLWRNORM;
> - spin_unlock(&info->lock);
> + mutex_unlock(&info->lock);
>
> return retval;
> }
> @@ -724,7 +724,7 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
> /* memory barrier not required, we hold info->lock */
> __set_current_state(TASK_INTERRUPTIBLE);
>
> - spin_unlock(&info->lock);
> + mutex_unlock(&info->lock);
> time = schedule_hrtimeout_range_clock(timeout, 0,
> HRTIMER_MODE_ABS, CLOCK_REALTIME);
>
> @@ -734,7 +734,7 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
> retval = 0;
> goto out;
> }
> - spin_lock(&info->lock);
> + mutex_lock(&info->lock);
>
> /* we hold info->lock, so no memory barrier required */
> if (READ_ONCE(ewp->state) == STATE_READY) {
> @@ -752,7 +752,7 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
> }
> list_del(&ewp->list);
> out_unlock:
> - spin_unlock(&info->lock);
> + mutex_unlock(&info->lock);
> out:
> return retval;
> }
> @@ -1125,7 +1125,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
> if (!info->node_cache)
> new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
>
> - spin_lock(&info->lock);
> + mutex_lock(&info->lock);
>
> if (!info->node_cache && new_leaf) {
> /* Save our speculative allocation into the cache */
> @@ -1166,7 +1166,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
> simple_inode_init_ts(inode);
> }
> out_unlock:
> - spin_unlock(&info->lock);
> + mutex_unlock(&info->lock);
> wake_up_q(&wake_q);
> out_free:
> if (ret)
> @@ -1230,7 +1230,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
> if (!info->node_cache)
> new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
>
> - spin_lock(&info->lock);
> + mutex_lock(&info->lock);
>
> if (!info->node_cache && new_leaf) {
> /* Save our speculative allocation into the cache */
> @@ -1242,7 +1242,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
>
> if (info->attr.mq_curmsgs == 0) {
> if (f.file->f_flags & O_NONBLOCK) {
> - spin_unlock(&info->lock);
> + mutex_unlock(&info->lock);
> ret = -EAGAIN;
> } else {
> wait.task = current;
> @@ -1261,7 +1261,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
>
> /* There is now free space in queue. */
> pipelined_receive(&wake_q, info);
> - spin_unlock(&info->lock);
> + mutex_unlock(&info->lock);
> wake_up_q(&wake_q);
> ret = 0;
> }
> @@ -1391,7 +1391,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
> info = MQUEUE_I(inode);
>
> ret = 0;
> - spin_lock(&info->lock);
> + mutex_lock(&info->lock);
> if (notification == NULL) {
> if (info->notify_owner == task_tgid(current)) {
> remove_notification(info);
> @@ -1424,7 +1424,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
> info->notify_user_ns = get_user_ns(current_user_ns());
> inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
> }
> - spin_unlock(&info->lock);
> + mutex_unlock(&info->lock);
> out_fput:
> fdput(f);
> out:
> @@ -1470,7 +1470,7 @@ static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
> inode = file_inode(f.file);
> info = MQUEUE_I(inode);
>
> - spin_lock(&info->lock);
> + mutex_lock(&info->lock);
>
> if (old) {
> *old = info->attr;
> @@ -1488,7 +1488,7 @@ static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
> inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
> }
>
> - spin_unlock(&info->lock);
> + mutex_unlock(&info->lock);
> fdput(f);
> return 0;
> }
++ping
Does anyone have ideas with this patch?
On 2024/1/2 9:09, shaozhengchao wrote:
> +ping
>
> Does anyone have ideas with this patch?
>
> On 2023/12/20 10:12, Zhengchao Shao wrote:
>> I analyze the potential sleeping issue of the following processes:
>> Thread A Thread B
>> ... netlink_create //ref = 1
>> do_mq_notify ...
>> sock = netlink_getsockbyfilp ... //ref = 2
>> info->notify_sock = sock; ...
>> ... netlink_sendmsg
>> ... skb =
>> netlink_alloc_large_skb //skb->head is vmalloced
>> ... netlink_unicast
>> ... sk =
>> netlink_getsockbyportid //ref = 3
>> ... netlink_sendskb
>> ... __netlink_sendskb
>> ... skb_queue_tail //put
>> skb to sk_receive_queue
>> ... sock_put //ref = 2
>> ... ...
>> ... netlink_release
>> ... deferred_put_nlk_sk //ref = 1
>> mqueue_flush_file
>> spin_lock
>> remove_notification
>> netlink_sendskb
>> sock_put //ref = 0
>> sk_free
>> ...
>> __sk_destruct
>> netlink_sock_destruct
>> skb_queue_purge //get skb from sk_receive_queue
>> ...
>> __skb_queue_purge_reason
>> kfree_skb_reason
>> __kfree_skb
>> ...
>> skb_release_all
>> skb_release_head_state
>> netlink_skb_destructor
>> vfree(skb->head) //sleeping while holding
>> spinlock
>>
>> In netlink_sendmsg, if the memory pointed to by skb->head is allocated by
>> vmalloc, and is put to sk_receive_queue queue, also the skb is not freed.
>> When the mqueue executes flush, the sleeping bug will occur. Use mutex
>> lock instead of spin lock in mqueue_flush_file.
>>
>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>> Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
>> ---
>> v2: CCed some networking maintainer & netdev list
>> ---
>> ipc/mqueue.c | 48 ++++++++++++++++++++++++------------------------
>> 1 file changed, 24 insertions(+), 24 deletions(-)
>>
>> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
>> index 5eea4dc0509e..f6f92e3f82e4 100644
>> --- a/ipc/mqueue.c
>> +++ b/ipc/mqueue.c
>> @@ -118,9 +118,9 @@ struct posix_msg_tree_node {
>> * Solution: use _release and _acquire barriers.
>> *
>> * 3) There is intentionally no barrier when setting current->state
>> - * to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
>> + * to TASK_INTERRUPTIBLE: mutex_unlock(&info->lock) provides the
>> * release memory barrier, and the wakeup is triggered when holding
>> - * info->lock, i.e. spin_lock(&info->lock) provided a pairing
>> + * info->lock, i.e. mutex_lock(&info->lock) provided a pairing
>> * acquire memory barrier.
>> */
>> @@ -132,7 +132,7 @@ struct ext_wait_queue { /* queue of
>> sleeping tasks */
>> };
>> struct mqueue_inode_info {
>> - spinlock_t lock;
>> + struct mutex lock;
>> struct inode vfs_inode;
>> wait_queue_head_t wait_q;
>> @@ -312,7 +312,7 @@ static struct inode *mqueue_get_inode(struct
>> super_block *sb,
>> inode->i_size = FILENT_SIZE;
>> /* mqueue specific info */
>> info = MQUEUE_I(inode);
>> - spin_lock_init(&info->lock);
>> + mutex_init(&info->lock);
>> init_waitqueue_head(&info->wait_q);
>> INIT_LIST_HEAD(&info->e_wait_q[0].list);
>> INIT_LIST_HEAD(&info->e_wait_q[1].list);
>> @@ -523,11 +523,11 @@ static void mqueue_evict_inode(struct inode *inode)
>> ipc_ns = get_ns_from_inode(inode);
>> info = MQUEUE_I(inode);
>> - spin_lock(&info->lock);
>> + mutex_lock(&info->lock);
>> while ((msg = msg_get(info)) != NULL)
>> list_add_tail(&msg->m_list, &tmp_msg);
>> kfree(info->node_cache);
>> - spin_unlock(&info->lock);
>> + mutex_unlock(&info->lock);
>> list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
>> list_del(&msg->m_list);
>> @@ -640,7 +640,7 @@ static ssize_t mqueue_read_file(struct file *filp,
>> char __user *u_data,
>> char buffer[FILENT_SIZE];
>> ssize_t ret;
>> - spin_lock(&info->lock);
>> + mutex_lock(&info->lock);
>> snprintf(buffer, sizeof(buffer),
>> "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
>> info->qsize,
>> @@ -649,7 +649,7 @@ static ssize_t mqueue_read_file(struct file *filp,
>> char __user *u_data,
>> info->notify.sigev_notify == SIGEV_SIGNAL) ?
>> info->notify.sigev_signo : 0,
>> pid_vnr(info->notify_owner));
>> - spin_unlock(&info->lock);
>> + mutex_unlock(&info->lock);
>> buffer[sizeof(buffer)-1] = '\0';
>> ret = simple_read_from_buffer(u_data, count, off, buffer,
>> @@ -665,11 +665,11 @@ static int mqueue_flush_file(struct file *filp,
>> fl_owner_t id)
>> {
>> struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
>> - spin_lock(&info->lock);
>> + mutex_lock(&info->lock);
>> if (task_tgid(current) == info->notify_owner)
>> remove_notification(info);
>> - spin_unlock(&info->lock);
>> + mutex_unlock(&info->lock);
>> return 0;
>> }
>> @@ -680,13 +680,13 @@ static __poll_t mqueue_poll_file(struct file
>> *filp, struct poll_table_struct *po
>> poll_wait(filp, &info->wait_q, poll_tab);
>> - spin_lock(&info->lock);
>> + mutex_lock(&info->lock);
>> if (info->attr.mq_curmsgs)
>> retval = EPOLLIN | EPOLLRDNORM;
>> if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
>> retval |= EPOLLOUT | EPOLLWRNORM;
>> - spin_unlock(&info->lock);
>> + mutex_unlock(&info->lock);
>> return retval;
>> }
>> @@ -724,7 +724,7 @@ static int wq_sleep(struct mqueue_inode_info
>> *info, int sr,
>> /* memory barrier not required, we hold info->lock */
>> __set_current_state(TASK_INTERRUPTIBLE);
>> - spin_unlock(&info->lock);
>> + mutex_unlock(&info->lock);
>> time = schedule_hrtimeout_range_clock(timeout, 0,
>> HRTIMER_MODE_ABS, CLOCK_REALTIME);
>> @@ -734,7 +734,7 @@ static int wq_sleep(struct mqueue_inode_info
>> *info, int sr,
>> retval = 0;
>> goto out;
>> }
>> - spin_lock(&info->lock);
>> + mutex_lock(&info->lock);
>> /* we hold info->lock, so no memory barrier required */
>> if (READ_ONCE(ewp->state) == STATE_READY) {
>> @@ -752,7 +752,7 @@ static int wq_sleep(struct mqueue_inode_info
>> *info, int sr,
>> }
>> list_del(&ewp->list);
>> out_unlock:
>> - spin_unlock(&info->lock);
>> + mutex_unlock(&info->lock);
>> out:
>> return retval;
>> }
>> @@ -1125,7 +1125,7 @@ static int do_mq_timedsend(mqd_t mqdes, const
>> char __user *u_msg_ptr,
>> if (!info->node_cache)
>> new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
>> - spin_lock(&info->lock);
>> + mutex_lock(&info->lock);
>> if (!info->node_cache && new_leaf) {
>> /* Save our speculative allocation into the cache */
>> @@ -1166,7 +1166,7 @@ static int do_mq_timedsend(mqd_t mqdes, const
>> char __user *u_msg_ptr,
>> simple_inode_init_ts(inode);
>> }
>> out_unlock:
>> - spin_unlock(&info->lock);
>> + mutex_unlock(&info->lock);
>> wake_up_q(&wake_q);
>> out_free:
>> if (ret)
>> @@ -1230,7 +1230,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char
>> __user *u_msg_ptr,
>> if (!info->node_cache)
>> new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
>> - spin_lock(&info->lock);
>> + mutex_lock(&info->lock);
>> if (!info->node_cache && new_leaf) {
>> /* Save our speculative allocation into the cache */
>> @@ -1242,7 +1242,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char
>> __user *u_msg_ptr,
>> if (info->attr.mq_curmsgs == 0) {
>> if (f.file->f_flags & O_NONBLOCK) {
>> - spin_unlock(&info->lock);
>> + mutex_unlock(&info->lock);
>> ret = -EAGAIN;
>> } else {
>> wait.task = current;
>> @@ -1261,7 +1261,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char
>> __user *u_msg_ptr,
>> /* There is now free space in queue. */
>> pipelined_receive(&wake_q, info);
>> - spin_unlock(&info->lock);
>> + mutex_unlock(&info->lock);
>> wake_up_q(&wake_q);
>> ret = 0;
>> }
>> @@ -1391,7 +1391,7 @@ static int do_mq_notify(mqd_t mqdes, const
>> struct sigevent *notification)
>> info = MQUEUE_I(inode);
>> ret = 0;
>> - spin_lock(&info->lock);
>> + mutex_lock(&info->lock);
>> if (notification == NULL) {
>> if (info->notify_owner == task_tgid(current)) {
>> remove_notification(info);
>> @@ -1424,7 +1424,7 @@ static int do_mq_notify(mqd_t mqdes, const
>> struct sigevent *notification)
>> info->notify_user_ns = get_user_ns(current_user_ns());
>> inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
>> }
>> - spin_unlock(&info->lock);
>> + mutex_unlock(&info->lock);
>> out_fput:
>> fdput(f);
>> out:
>> @@ -1470,7 +1470,7 @@ static int do_mq_getsetattr(int mqdes, struct
>> mq_attr *new, struct mq_attr *old)
>> inode = file_inode(f.file);
>> info = MQUEUE_I(inode);
>> - spin_lock(&info->lock);
>> + mutex_lock(&info->lock);
>> if (old) {
>> *old = info->attr;
>> @@ -1488,7 +1488,7 @@ static int do_mq_getsetattr(int mqdes, struct
>> mq_attr *new, struct mq_attr *old)
>> inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
>> }
>> - spin_unlock(&info->lock);
>> + mutex_unlock(&info->lock);
>> fdput(f);
>> return 0;
>> }
>
On 2023/12/20 10:12, Zhengchao Shao wrote:
> I analyze the potential sleeping issue of the following processes:
> Thread A Thread B
> ... netlink_create //ref = 1
> do_mq_notify ...
> sock = netlink_getsockbyfilp ... //ref = 2
> info->notify_sock = sock; ...
> ... netlink_sendmsg
> ... skb = netlink_alloc_large_skb //skb->head is vmalloced
> ... netlink_unicast
> ... sk = netlink_getsockbyportid //ref = 3
> ... netlink_sendskb
> ... __netlink_sendskb
> ... skb_queue_tail //put skb to sk_receive_queue
> ... sock_put //ref = 2
> ... ...
> ... netlink_release
> ... deferred_put_nlk_sk //ref = 1
> mqueue_flush_file
> spin_lock
> remove_notification
> netlink_sendskb
> sock_put //ref = 0
> sk_free
> ...
> __sk_destruct
> netlink_sock_destruct
> skb_queue_purge //get skb from sk_receive_queue
> ...
> __skb_queue_purge_reason
> kfree_skb_reason
> __kfree_skb
> ...
> skb_release_all
> skb_release_head_state
> netlink_skb_destructor
> vfree(skb->head) //sleeping while holding spinlock
>
> In netlink_sendmsg, if the memory pointed to by skb->head is allocated by
> vmalloc, and is put to sk_receive_queue queue, also the skb is not freed.
> When the mqueue executes flush, the sleeping bug will occur. Use mutex
> lock instead of spin lock in mqueue_flush_file.
It makes no sense to replace spinlock with mutex just for putting sock.
Only for thoughts.
--- x/ipc/mqueue.c
+++ y/ipc/mqueue.c
@@ -663,12 +663,17 @@ static ssize_t mqueue_read_file(struct f
static int mqueue_flush_file(struct file *filp, fl_owner_t id)
{
struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
+ struct sock *sk = NULL;
spin_lock(&info->lock);
- if (task_tgid(current) == info->notify_owner)
+ if (task_tgid(current) == info->notify_owner) {
+ sk = info->notify_sock;
+ sock_hold(sk);
remove_notification(info);
-
+ }
spin_unlock(&info->lock);
+ if (sk)
+ sock_put(sk);
return 0;
}
On 2024/1/18 19:46, Hillf Danton wrote:
> On 2023/12/20 10:12, Zhengchao Shao wrote:
>> I analyze the potential sleeping issue of the following processes:
>> Thread A Thread B
>> ... netlink_create //ref = 1
>> do_mq_notify ...
>> sock = netlink_getsockbyfilp ... //ref = 2
>> info->notify_sock = sock; ...
>> ... netlink_sendmsg
>> ... skb = netlink_alloc_large_skb //skb->head is vmalloced
>> ... netlink_unicast
>> ... sk = netlink_getsockbyportid //ref = 3
>> ... netlink_sendskb
>> ... __netlink_sendskb
>> ... skb_queue_tail //put skb to sk_receive_queue
>> ... sock_put //ref = 2
>> ... ...
>> ... netlink_release
>> ... deferred_put_nlk_sk //ref = 1
>> mqueue_flush_file
>> spin_lock
>> remove_notification
>> netlink_sendskb
>> sock_put //ref = 0
>> sk_free
>> ...
>> __sk_destruct
>> netlink_sock_destruct
>> skb_queue_purge //get skb from sk_receive_queue
>> ...
>> __skb_queue_purge_reason
>> kfree_skb_reason
>> __kfree_skb
>> ...
>> skb_release_all
>> skb_release_head_state
>> netlink_skb_destructor
>> vfree(skb->head) //sleeping while holding spinlock
>>
>> In netlink_sendmsg, if the memory pointed to by skb->head is allocated by
>> vmalloc, and is put to sk_receive_queue queue, also the skb is not freed.
>> When the mqueue executes flush, the sleeping bug will occur. Use mutex
>> lock instead of spin lock in mqueue_flush_file.
>
> It makes no sense to replace spinlock with mutex just for putting sock.
>
> Only for thoughts.
>
> --- x/ipc/mqueue.c
> +++ y/ipc/mqueue.c
> @@ -663,12 +663,17 @@ static ssize_t mqueue_read_file(struct f
> static int mqueue_flush_file(struct file *filp, fl_owner_t id)
> {
> struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
> + struct sock *sk = NULL;
>
> spin_lock(&info->lock);
> - if (task_tgid(current) == info->notify_owner)
> + if (task_tgid(current) == info->notify_owner) {
> + sk = info->notify_sock;
> + sock_hold(sk);
> remove_notification(info);
> -
> + }
> spin_unlock(&info->lock);
> + if (sk)
> + sock_put(sk);
> return 0;
> }
>
Hi Hillf:
Thank you for your review. Your changes are more concise and
clear. I will send V3.
Zhengchao Shao
@@ -118,9 +118,9 @@ struct posix_msg_tree_node {
* Solution: use _release and _acquire barriers.
*
* 3) There is intentionally no barrier when setting current->state
- * to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
+ * to TASK_INTERRUPTIBLE: mutex_unlock(&info->lock) provides the
* release memory barrier, and the wakeup is triggered when holding
- * info->lock, i.e. spin_lock(&info->lock) provided a pairing
+ * info->lock, i.e. mutex_lock(&info->lock) provided a pairing
* acquire memory barrier.
*/
@@ -132,7 +132,7 @@ struct ext_wait_queue { /* queue of sleeping tasks */
};
struct mqueue_inode_info {
- spinlock_t lock;
+ struct mutex lock;
struct inode vfs_inode;
wait_queue_head_t wait_q;
@@ -312,7 +312,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
inode->i_size = FILENT_SIZE;
/* mqueue specific info */
info = MQUEUE_I(inode);
- spin_lock_init(&info->lock);
+ mutex_init(&info->lock);
init_waitqueue_head(&info->wait_q);
INIT_LIST_HEAD(&info->e_wait_q[0].list);
INIT_LIST_HEAD(&info->e_wait_q[1].list);
@@ -523,11 +523,11 @@ static void mqueue_evict_inode(struct inode *inode)
ipc_ns = get_ns_from_inode(inode);
info = MQUEUE_I(inode);
- spin_lock(&info->lock);
+ mutex_lock(&info->lock);
while ((msg = msg_get(info)) != NULL)
list_add_tail(&msg->m_list, &tmp_msg);
kfree(info->node_cache);
- spin_unlock(&info->lock);
+ mutex_unlock(&info->lock);
list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
list_del(&msg->m_list);
@@ -640,7 +640,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
char buffer[FILENT_SIZE];
ssize_t ret;
- spin_lock(&info->lock);
+ mutex_lock(&info->lock);
snprintf(buffer, sizeof(buffer),
"QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
info->qsize,
@@ -649,7 +649,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
info->notify.sigev_notify == SIGEV_SIGNAL) ?
info->notify.sigev_signo : 0,
pid_vnr(info->notify_owner));
- spin_unlock(&info->lock);
+ mutex_unlock(&info->lock);
buffer[sizeof(buffer)-1] = '\0';
ret = simple_read_from_buffer(u_data, count, off, buffer,
@@ -665,11 +665,11 @@ static int mqueue_flush_file(struct file *filp, fl_owner_t id)
{
struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
- spin_lock(&info->lock);
+ mutex_lock(&info->lock);
if (task_tgid(current) == info->notify_owner)
remove_notification(info);
- spin_unlock(&info->lock);
+ mutex_unlock(&info->lock);
return 0;
}
@@ -680,13 +680,13 @@ static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *po
poll_wait(filp, &info->wait_q, poll_tab);
- spin_lock(&info->lock);
+ mutex_lock(&info->lock);
if (info->attr.mq_curmsgs)
retval = EPOLLIN | EPOLLRDNORM;
if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
retval |= EPOLLOUT | EPOLLWRNORM;
- spin_unlock(&info->lock);
+ mutex_unlock(&info->lock);
return retval;
}
@@ -724,7 +724,7 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
/* memory barrier not required, we hold info->lock */
__set_current_state(TASK_INTERRUPTIBLE);
- spin_unlock(&info->lock);
+ mutex_unlock(&info->lock);
time = schedule_hrtimeout_range_clock(timeout, 0,
HRTIMER_MODE_ABS, CLOCK_REALTIME);
@@ -734,7 +734,7 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
retval = 0;
goto out;
}
- spin_lock(&info->lock);
+ mutex_lock(&info->lock);
/* we hold info->lock, so no memory barrier required */
if (READ_ONCE(ewp->state) == STATE_READY) {
@@ -752,7 +752,7 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
}
list_del(&ewp->list);
out_unlock:
- spin_unlock(&info->lock);
+ mutex_unlock(&info->lock);
out:
return retval;
}
@@ -1125,7 +1125,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
if (!info->node_cache)
new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
- spin_lock(&info->lock);
+ mutex_lock(&info->lock);
if (!info->node_cache && new_leaf) {
/* Save our speculative allocation into the cache */
@@ -1166,7 +1166,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
simple_inode_init_ts(inode);
}
out_unlock:
- spin_unlock(&info->lock);
+ mutex_unlock(&info->lock);
wake_up_q(&wake_q);
out_free:
if (ret)
@@ -1230,7 +1230,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
if (!info->node_cache)
new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
- spin_lock(&info->lock);
+ mutex_lock(&info->lock);
if (!info->node_cache && new_leaf) {
/* Save our speculative allocation into the cache */
@@ -1242,7 +1242,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
if (info->attr.mq_curmsgs == 0) {
if (f.file->f_flags & O_NONBLOCK) {
- spin_unlock(&info->lock);
+ mutex_unlock(&info->lock);
ret = -EAGAIN;
} else {
wait.task = current;
@@ -1261,7 +1261,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
/* There is now free space in queue. */
pipelined_receive(&wake_q, info);
- spin_unlock(&info->lock);
+ mutex_unlock(&info->lock);
wake_up_q(&wake_q);
ret = 0;
}
@@ -1391,7 +1391,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
info = MQUEUE_I(inode);
ret = 0;
- spin_lock(&info->lock);
+ mutex_lock(&info->lock);
if (notification == NULL) {
if (info->notify_owner == task_tgid(current)) {
remove_notification(info);
@@ -1424,7 +1424,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
info->notify_user_ns = get_user_ns(current_user_ns());
inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
}
- spin_unlock(&info->lock);
+ mutex_unlock(&info->lock);
out_fput:
fdput(f);
out:
@@ -1470,7 +1470,7 @@ static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
inode = file_inode(f.file);
info = MQUEUE_I(inode);
- spin_lock(&info->lock);
+ mutex_lock(&info->lock);
if (old) {
*old = info->attr;
@@ -1488,7 +1488,7 @@ static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
}
- spin_unlock(&info->lock);
+ mutex_unlock(&info->lock);
fdput(f);
return 0;
}