[v4,5/5] spi: spi-qcom-qspi: Add DMA mode support

Message ID 1681996394-13099-6-git-send-email-quic_vnivarth@quicinc.com
State New
Headers
Series spi: Add DMA mode support to spi-qcom-qspi |

Commit Message

Vijaya Krishna Nivarthi April 20, 2023, 1:13 p.m. UTC
  Current driver supports only PIO mode.

HW supports DMA, so add DMA mode support to the driver
for better performance for larger xfers.

Signed-off-by: Vijaya Krishna Nivarthi <quic_vnivarth@quicinc.com>
---
v3 -> v4:
- corrected tabs spacing
- dropped dma data descriptors
- dropped INVALID from xfer_mode enum
- qspi_buswidth_to_iomode() to return iomode without shifting
- dropped non-functional change in qcom_qspi_set_speed()
- drop redundant call to wmb()
- fallback to pio if dma fails to setup
- use dmam_pool_create() the devm version
- thus drop dma_pool_destroy()
- set dma_alignment field in probe()
- other minor changes

v2 -> v3:
- dropped Reported-by tag

v1 -> v2:
- modified commit message
- addressed kernel test robot error
- correct placement of header file includes
- added more comments
- coding style related changes
- renamed variables
- used u32/u8 instead of uint32_t/8_t
- removed unnecessary casting
- retain same MSTR_CONFIG as PIO for DMA
- unset fragment bit only for last cmd_desc of last xfer
---
 drivers/spi/spi-qcom-qspi.c | 292 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 265 insertions(+), 27 deletions(-)
  

Comments

Doug Anderson April 20, 2023, 5:19 p.m. UTC | #1
Hi,

On Thu, Apr 20, 2023 at 6:13 AM Vijaya Krishna Nivarthi
<quic_vnivarth@quicinc.com> wrote:
>
> @@ -137,11 +155,29 @@ enum qspi_clocks {
>         QSPI_NUM_CLKS
>  };
>
> +enum qspi_xfer_mode {
> +       QSPI_FIFO,
> +       QSPI_DMA
> +};
> +
> +/*
> + * Number of entries in sgt returned from spi framework that-
> + * will be supported. Can be modified as required.
> + * In practice, given max_dma_len is 64KB, the number of
> + * entries is not expected to exceed 1.
> + */
> +#define QSPI_MAX_SG 5

I actually wonder if this would be more nicely done just using a
linked list, which naturally mirrors how SGs work anyway. You'd add
"struct list_head" to the end of "struct qspi_cmd_desc" and just store
a pointer to the head in "struct qcom_qspi".

For freeing, you can always get back the "virtual" address because
it's just the address of each node. You can always get back the
physical address because it's stored in "data_address".


> @@ -223,6 +261,16 @@ static void qcom_qspi_handle_err(struct spi_master *master,
>         spin_lock_irqsave(&ctrl->lock, flags);
>         writel(0, ctrl->base + MSTR_INT_EN);

Can you also clear all interrupts here? That will make sure that if
the interrupt somehow fires after you run that it will detect that
there's nothing to do.


>         ctrl->xfer.rem_bytes = 0;
> +
> +       if (ctrl->xfer_mode == QSPI_DMA) {
> +               int i;
> +
> +               /* free cmd descriptors */
> +               for (i = 0; i < ctrl->n_cmd_desc; i++)
> +                       dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
> +                                         ctrl->dma_cmd_desc[i]);
> +               ctrl->n_cmd_desc = 0;
> +       }

Instead of checking for ctrl->xfer_mode, why not just check for
ctrl->n_cmd_desc? Then you can get rid of "ctrl->xfer_mode".


> @@ -258,6 +306,120 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
>         return 0;
>  }
>
> +#define QSPI_ALIGN_REQ 32

nit: put this at the top of the file with other #defines.


> +static int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, dma_addr_t dma_ptr,
> +                       uint32_t n_bytes)
> +{
> +       struct qspi_cmd_desc *virt_cmd_desc, *prev;
> +       dma_addr_t dma_cmd_desc;
> +
> +       /* allocate for dma cmd descriptor */
> +       virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
> +               GFP_KERNEL, &dma_cmd_desc);

Remove unnecessary cast; "void *" assigns fine w/out a cast.

Add "| GFP_ZERO" and then get rid of the need to clear the "reserved"
and "next_descriptor" stuff below.


> +       if (!virt_cmd_desc) {
> +               dev_err(ctrl->dev,
> +                       "Could not allocate for cmd_desc\n");
> +               return -ENOMEM;
> +       }

You never need to add an extra message for allocation failures (they
already splat). Remove it.


> +       ctrl->virt_cmd_desc[ctrl->n_cmd_desc] = virt_cmd_desc;
> +       ctrl->dma_cmd_desc[ctrl->n_cmd_desc] = dma_cmd_desc;
> +       ctrl->n_cmd_desc++;
> +
> +       /* setup cmd descriptor */
> +       virt_cmd_desc->data_address = dma_ptr;
> +       virt_cmd_desc->next_descriptor = 0;
> +       virt_cmd_desc->direction = ctrl->xfer.dir;
> +       virt_cmd_desc->multi_io_mode = qspi_buswidth_to_iomode(ctrl, ctrl->xfer.buswidth);
> +       virt_cmd_desc->reserved1 = 0;
> +       virt_cmd_desc->fragment = ctrl->xfer.is_last ? 0 : 1;

virt_cmd_desc->fragment = !ctrl->xfer.is_last;


> +       virt_cmd_desc->reserved2 = 0;
> +       virt_cmd_desc->length = n_bytes;
> +
> +       /* update previous descriptor */
> +       if (ctrl->n_cmd_desc >= 2) {
> +               prev = (ctrl->virt_cmd_desc)[ctrl->n_cmd_desc - 2];
> +               prev->next_descriptor = dma_cmd_desc;
> +               prev->fragment = 1;
> +       }
> +
> +       return 0;
> +}
> +
> +static int qcom_qspi_setup_dma_desc(struct qcom_qspi *ctrl,
> +                               struct spi_transfer *xfer)
> +{
> +       int ret;
> +       struct sg_table *sgt;
> +       unsigned int sg_total_len = 0;
> +       dma_addr_t dma_ptr_sg;
> +       unsigned int dma_len_sg;
> +       int i;
> +
> +       if (ctrl->n_cmd_desc) {
> +               dev_err(ctrl->dev, "Remnant dma buffers n_cmd_desc-%d\n", ctrl->n_cmd_desc);
> +               return -EIO;
> +       }
> +
> +       sgt = (ctrl->xfer.dir == QSPI_READ) ? &xfer->rx_sg : &xfer->tx_sg;
> +       if (!sgt->nents || sgt->nents > QSPI_MAX_SG) {
> +               dev_err(ctrl->dev, "Cannot handle %d entries in scatter list\n", sgt->nents);
> +               return -EAGAIN;

If you're retrying, don't use "dev_err" but instead "dev_warn".
Similar in other places.


> +       }
> +
> +       for (i = 0; i < sgt->nents; i++) {
> +               dma_ptr_sg = sg_dma_address(sgt->sgl + i);
> +               if (!IS_ALIGNED(dma_ptr_sg, QSPI_ALIGN_REQ)) {
> +                       dev_err(ctrl->dev, "dma address-%pad not aligned to %d\n",
> +                               &dma_ptr_sg, QSPI_ALIGN_REQ);

In general it's not good practice to put pointer values into the error
log as it can be an attack vector. Probably the %p will be replaced
with something bogus anyway, but it'll look weird.


> +                       return -EAGAIN;
> +               }
> +               sg_total_len += sg_dma_len(sgt->sgl + i);
> +       }
> +
> +       if (sg_total_len != xfer->len) {
> +               dev_err(ctrl->dev, "Data lengths mismatch\n");
> +               return -EAGAIN;
> +       }

This feels like overly defensive programming. The SPI framework is
what's in charge of setting up the scatter gather lists and it can be
trusted to give you something where the total transfer length matches.
IMO, drop that validation. I'm OK w/ keeping the double-check of the
alignment since that's handled by the client drivers and they are less
trustworthy.


> +
> +       for (i = 0; i < sgt->nents; i++) {
> +               dma_ptr_sg = sg_dma_address(sgt->sgl + i);
> +               dma_len_sg = sg_dma_len(sgt->sgl + i);
> +
> +               ret = qcom_qspi_alloc_desc(ctrl, dma_ptr_sg, dma_len_sg);
> +               if (ret)
> +                       goto cleanup;
> +       }
> +       return 0;
> +
> +cleanup:
> +       dev_err(ctrl->dev, "ERROR cleanup in setup_dma_desc\n");

Drop above print--we should have already printed any relevant errors.


> @@ -290,8 +454,37 @@ static int qcom_qspi_transfer_one(struct spi_master *master,
>         ctrl->xfer.is_last = list_is_last(&xfer->transfer_list,
>                                           &master->cur_msg->transfers);
>         ctrl->xfer.rem_bytes = xfer->len;
> +
> +       if (xfer->rx_sg.nents || xfer->tx_sg.nents) {
> +               /* do DMA transfer */
> +               ctrl->xfer_mode = QSPI_DMA;
> +               if (!(mstr_cfg & DMA_ENABLE)) {
> +                       mstr_cfg |= DMA_ENABLE;
> +                       writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
> +               }
> +
> +               ret = qcom_qspi_setup_dma_desc(ctrl, xfer);
> +               if (ret) {
> +                       if (ret == -EAGAIN) {
> +                               dev_err_once(ctrl->dev, "DMA failure, falling back to PIO");
> +                               goto do_pio;
> +                       }
> +                       spin_unlock_irqrestore(&ctrl->lock, flags);
> +                       return ret;
> +               }
> +               qcom_qspi_dma_xfer(ctrl);
> +               goto end;
> +       }
> +
> +do_pio:

A bit nitty, but the "do_pio" label feels like a slight stretch from
what I consider the acceptable uses of "goto". Maybe it's better to
avoid it? The "end" label is OK w/ me, though usually I see it called
"exit". AKA:

  ...
  ret = qcom_qspi_setup_dma_desc(ctrl, xfer);
  if (ret != -EAGAIN) {
    if (!ret)
      qcom_qspi_dma_xfer(ctrl);
    goto exit;
  }
  dev_warn_once(...);
  ret = 0; /* We'll retry w/ PIO */
}

...
...

exit:
  spin_unlock_irqrestore(&ctrl->lock, flags);

  if (ret)
    return ret;

  /* We'll call spi_finalize_current_transfer() when done */
  return 1;


> @@ -328,6 +521,17 @@ static int qcom_qspi_prepare_message(struct spi_master *master,
>         return 0;
>  }
>
> +static int qcom_qspi_alloc_dma(struct qcom_qspi *ctrl)
> +{
> +       /* allocate for cmd descriptors pool */

The above comment doesn't add much. Drop?


> @@ -426,27 +630,48 @@ static irqreturn_t qcom_qspi_irq(int irq, void *dev_id)
>         int_status = readl(ctrl->base + MSTR_INT_STATUS);
>         writel(int_status, ctrl->base + MSTR_INT_STATUS);
>
> -       if (ctrl->xfer.dir == QSPI_WRITE) {
> -               if (int_status & WR_FIFO_EMPTY)
> -                       ret = pio_write(ctrl);
> -       } else {
> -               if (int_status & RESP_FIFO_RDY)
> -                       ret = pio_read(ctrl);
> -       }
> -
> -       if (int_status & QSPI_ERR_IRQS) {
> -               if (int_status & RESP_FIFO_UNDERRUN)
> -                       dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
> -               if (int_status & WR_FIFO_OVERRUN)
> -                       dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
> -               if (int_status & HRESP_FROM_NOC_ERR)
> -                       dev_err(ctrl->dev, "IRQ error: NOC response error\n");
> -               ret = IRQ_HANDLED;
> -       }
> -
> -       if (!ctrl->xfer.rem_bytes) {
> -               writel(0, ctrl->base + MSTR_INT_EN);
> -               spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
> +       switch (ctrl->xfer_mode) {
> +       case QSPI_FIFO:
> +               if (ctrl->xfer.dir == QSPI_WRITE) {
> +                       if (int_status & WR_FIFO_EMPTY)
> +                               ret = pio_write(ctrl);
> +               } else {
> +                       if (int_status & RESP_FIFO_RDY)
> +                               ret = pio_read(ctrl);
> +               }
> +
> +               if (int_status & QSPI_ERR_IRQS) {
> +                       if (int_status & RESP_FIFO_UNDERRUN)
> +                               dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
> +                       if (int_status & WR_FIFO_OVERRUN)
> +                               dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
> +                       if (int_status & HRESP_FROM_NOC_ERR)
> +                               dev_err(ctrl->dev, "IRQ error: NOC response error\n");
> +                       ret = IRQ_HANDLED;
> +               }
> +
> +               if (!ctrl->xfer.rem_bytes) {
> +                       writel(0, ctrl->base + MSTR_INT_EN);
> +                       spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
> +               }
> +               break;
> +       case QSPI_DMA:
> +               if (int_status & DMA_CHAIN_DONE) {
> +                       int i;
> +
> +                       writel(0, ctrl->base + MSTR_INT_EN);
> +
> +                       for (i = 0; i < ctrl->n_cmd_desc; i++)
> +                               dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
> +                                                 ctrl->dma_cmd_desc[i]);
> +                       ctrl->n_cmd_desc = 0;
> +
> +                       ret = IRQ_HANDLED;
> +                       spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
> +               }
> +               break;
> +       default:
> +               dev_err(ctrl->dev, "Unknown xfer mode:%d", ctrl->xfer_mode);

I'm still of the opinion that you should drop xfer_mode, which means
deleting it from above.


> @@ -517,7 +742,14 @@ static int qcom_qspi_probe(struct platform_device *pdev)
>                 return ret;
>         }
>
> +       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
> +       if (ret)
> +               return dev_err_probe(dev, ret, "could not set DMA mask\n");
> +
>         master->max_speed_hz = 300000000;
> +       master->max_dma_len = 65536; /* as per HPG */
> +       /* intimate protocal drivers about alignment requirement */

Comment above doesn't add much and is already in the comment in the
definition of the structure. Drop it.


> +       master->dma_alignment = QSPI_ALIGN_REQ;
>         master->num_chipselect = QSPI_NUM_CS;
>         master->bus_num = -1;
>         master->dev.of_node = pdev->dev.of_node;
> @@ -528,6 +760,7 @@ static int qcom_qspi_probe(struct platform_device *pdev)
>         master->prepare_message = qcom_qspi_prepare_message;
>         master->transfer_one = qcom_qspi_transfer_one;
>         master->handle_err = qcom_qspi_handle_err;
> +       master->can_dma = qcom_qspi_can_dma;
>         master->auto_runtime_pm = true;
>
>         ret = devm_pm_opp_set_clkname(&pdev->dev, "core");
> @@ -540,6 +773,11 @@ static int qcom_qspi_probe(struct platform_device *pdev)
>                 return ret;
>         }
>
> +       /* allocate for DMA descriptor pools */

Above comment is obvious from the name of the function you're calling.
  
Doug Anderson April 20, 2023, 6:09 p.m. UTC | #2
Hi,

On Thu, Apr 20, 2023 at 6:13 AM Vijaya Krishna Nivarthi
<quic_vnivarth@quicinc.com> wrote:
>
> @@ -528,6 +760,7 @@ static int qcom_qspi_probe(struct platform_device *pdev)
>         master->prepare_message = qcom_qspi_prepare_message;
>         master->transfer_one = qcom_qspi_transfer_one;
>         master->handle_err = qcom_qspi_handle_err;
> +       master->can_dma = qcom_qspi_can_dma;

One extra comment: it might be worth adding something like this (untested):

if (of_property_read_bool(np, "iommus"))
  master->can_dma = qcom_qspi_can_dma;

That will have the dual benefit of making old device trees work (which
we're supposed to to if possible) and also making it easier to land
these changes. Without something like that then I think transfers will
fail for anyone who pulls in the SPI tree but not the Qualcomm DT
tree.
  
Vijaya Krishna Nivarthi April 21, 2023, 4:57 p.m. UTC | #3
Hi,

Thanks a lot for the review and inputs...


On 4/20/2023 10:49 PM, Doug Anderson wrote:
> Hi,
>
> On Thu, Apr 20, 2023 at 6:13 AM Vijaya Krishna Nivarthi
> <quic_vnivarth@quicinc.com> wrote:
>> @@ -137,11 +155,29 @@ enum qspi_clocks {
>>          QSPI_NUM_CLKS
>>   };
>>
>> +enum qspi_xfer_mode {
>> +       QSPI_FIFO,
>> +       QSPI_DMA
>> +};
>> +
>> +/*
>> + * Number of entries in sgt returned from spi framework that-
>> + * will be supported. Can be modified as required.
>> + * In practice, given max_dma_len is 64KB, the number of
>> + * entries is not expected to exceed 1.
>> + */
>> +#define QSPI_MAX_SG 5
> I actually wonder if this would be more nicely done just using a
> linked list, which naturally mirrors how SGs work anyway. You'd add
> "struct list_head" to the end of "struct qspi_cmd_desc" and just store
> a pointer to the head in "struct qcom_qspi".
>
> For freeing, you can always get back the "virtual" address because
> it's just the address of each node. You can always get back the
> physical address because it's stored in "data_address".
>
Please note that in "struct qspi_cmd_desc"

data_address - dma_address of data buffer returned by spi framework

next_descriptor - dma_address of the next descriptor in chain


If we were to have a linked list of descriptors that we can parse and 
free, it would require 2 more fields

this_descriptor_dma - dma address of the current descriptor

next_descriptor_virt - virtual address of the next descriptor in chain


I tried adding same and it seemed like it was getting a little confusing.

Given the SGL is also an array in SGT, it seemed ok to retain an array, 
though it would have good to have a chain of cmd_descriptors in SW like 
in HW.

Agreed the fixed size array comes at a cost of artificial limitation of 
5 entries, which anyway seems like something we are not going to 
encounter in practice.


So for now, I retained the array.

All the other comments are addressed as recommended, except 1 change below

Please let know what you think.


Thank you,

Vijay/


>
>> +static int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, dma_addr_t dma_ptr,
>> +                       uint32_t n_bytes)
>> +{
>> +       struct qspi_cmd_desc *virt_cmd_desc, *prev;
>> +       dma_addr_t dma_cmd_desc;
>> +
>> +       /* allocate for dma cmd descriptor */
>> +       virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
>> +               GFP_KERNEL, &dma_cmd_desc);
> Remove unnecessary cast; "void *" assigns fine w/out a cast.
>
> Add "| GFP_ZERO" and then get rid of the need to clear the "reserved"
> and "next_descriptor" stuff below.
>
I needed __GFP_ZERO to prevent a compile error, used same.
  
Doug Anderson April 21, 2023, 5:35 p.m. UTC | #4
Hi,

On Fri, Apr 21, 2023 at 9:58 AM Vijaya Krishna Nivarthi
<quic_vnivarth@quicinc.com> wrote:
>
> Hi,
>
> Thanks a lot for the review and inputs...
>
>
> On 4/20/2023 10:49 PM, Doug Anderson wrote:
> > Hi,
> >
> > On Thu, Apr 20, 2023 at 6:13 AM Vijaya Krishna Nivarthi
> > <quic_vnivarth@quicinc.com> wrote:
> >> @@ -137,11 +155,29 @@ enum qspi_clocks {
> >>          QSPI_NUM_CLKS
> >>   };
> >>
> >> +enum qspi_xfer_mode {
> >> +       QSPI_FIFO,
> >> +       QSPI_DMA
> >> +};
> >> +
> >> +/*
> >> + * Number of entries in sgt returned from spi framework that-
> >> + * will be supported. Can be modified as required.
> >> + * In practice, given max_dma_len is 64KB, the number of
> >> + * entries is not expected to exceed 1.
> >> + */
> >> +#define QSPI_MAX_SG 5
> > I actually wonder if this would be more nicely done just using a
> > linked list, which naturally mirrors how SGs work anyway. You'd add
> > "struct list_head" to the end of "struct qspi_cmd_desc" and just store
> > a pointer to the head in "struct qcom_qspi".
> >
> > For freeing, you can always get back the "virtual" address because
> > it's just the address of each node. You can always get back the
> > physical address because it's stored in "data_address".
> >
> Please note that in "struct qspi_cmd_desc"
>
> data_address - dma_address of data buffer returned by spi framework
>
> next_descriptor - dma_address of the next descriptor in chain
>
>
> If we were to have a linked list of descriptors that we can parse and
> free, it would require 2 more fields
>
> this_descriptor_dma - dma address of the current descriptor

Isn't that exactly the same value as "data_address"? Sure,
"data_address" is a u32 and the DMA address is 64-bits, but elsewhere
in the code you already rely on the fact that the upper bits of the
DMA address are 0 when you do:

virt_cmd_desc->data_address = dma_ptr


> next_descriptor_virt - virtual address of the next descriptor in chain

Right, this would be the value of the next node in the linked list,
right? So basically by adding a list_node_t you can find it easily.


> >> +static int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, dma_addr_t dma_ptr,
> >> +                       uint32_t n_bytes)
> >> +{
> >> +       struct qspi_cmd_desc *virt_cmd_desc, *prev;
> >> +       dma_addr_t dma_cmd_desc;
> >> +
> >> +       /* allocate for dma cmd descriptor */
> >> +       virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
> >> +               GFP_KERNEL, &dma_cmd_desc);
> > Remove unnecessary cast; "void *" assigns fine w/out a cast.
> >
> > Add "| GFP_ZERO" and then get rid of the need to clear the "reserved"
> > and "next_descriptor" stuff below.
> >
> I needed __GFP_ZERO to prevent a compile error, used same.

Ah, sorry. Sounds good.

-Doug
  
Vijaya Krishna Nivarthi April 21, 2023, 6:21 p.m. UTC | #5
Hi,


On 4/21/2023 11:05 PM, Doug Anderson wrote:
> Hi,
>
> On Fri, Apr 21, 2023 at 9:58 AM Vijaya Krishna Nivarthi
> <quic_vnivarth@quicinc.com> wrote:
>> Hi,
>>
>> Thanks a lot for the review and inputs...
>>
>>
>> On 4/20/2023 10:49 PM, Doug Anderson wrote:
>>> Hi,
>>>
>>> On Thu, Apr 20, 2023 at 6:13 AM Vijaya Krishna Nivarthi
>>> <quic_vnivarth@quicinc.com> wrote:
>>>> @@ -137,11 +155,29 @@ enum qspi_clocks {
>>>>           QSPI_NUM_CLKS
>>>>    };
>>>>
>>>> +enum qspi_xfer_mode {
>>>> +       QSPI_FIFO,
>>>> +       QSPI_DMA
>>>> +};
>>>> +
>>>> +/*
>>>> + * Number of entries in sgt returned from spi framework that-
>>>> + * will be supported. Can be modified as required.
>>>> + * In practice, given max_dma_len is 64KB, the number of
>>>> + * entries is not expected to exceed 1.
>>>> + */
>>>> +#define QSPI_MAX_SG 5
>>> I actually wonder if this would be more nicely done just using a
>>> linked list, which naturally mirrors how SGs work anyway. You'd add
>>> "struct list_head" to the end of "struct qspi_cmd_desc" and just store
>>> a pointer to the head in "struct qcom_qspi".
>>>
>>> For freeing, you can always get back the "virtual" address because
>>> it's just the address of each node. You can always get back the
>>> physical address because it's stored in "data_address".
>>>
>> Please note that in "struct qspi_cmd_desc"
>>
>> data_address - dma_address of data buffer returned by spi framework
>>
>> next_descriptor - dma_address of the next descriptor in chain
>>
>>
>> If we were to have a linked list of descriptors that we can parse and
>> free, it would require 2 more fields
>>
>> this_descriptor_dma - dma address of the current descriptor
> Isn't that exactly the same value as "data_address"? Sure,
> "data_address" is a u32 and the DMA address is 64-bits, but elsewhere
> in the code you already rely on the fact that the upper bits of the
> DMA address are 0 when you do:


No; data_address is the dma_address mapped to the xfer buffer.

This is provided by spi framework and retrieved from sgl.

"this_descriptor" would be the dma_address of the current cmd_descriptor.

this is returned from dma_pool_alloc()

this would be required for freeing.


> virt_cmd_desc->data_address = dma_ptr
>
>
>> next_descriptor_virt - virtual address of the next descriptor in chain
> Right, this would be the value of the next node in the linked list,
> right? So basically by adding a list_node_t you can find it easily.
>
>
>>>> +static int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, dma_addr_t dma_ptr,
>>>> +                       uint32_t n_bytes)
>>>> +{
>>>> +       struct qspi_cmd_desc *virt_cmd_desc, *prev;
>>>> +       dma_addr_t dma_cmd_desc;
>>>> +
>>>> +       /* allocate for dma cmd descriptor */
>>>> +       virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
>>>> +               GFP_KERNEL, &dma_cmd_desc);
>>> Remove unnecessary cast; "void *" assigns fine w/out a cast.
>>>
>>> Add "| GFP_ZERO" and then get rid of the need to clear the "reserved"
>>> and "next_descriptor" stuff below.
>>>
>> I needed __GFP_ZERO to prevent a compile error, used same.
> Ah, sorry. Sounds good.
>
> -Doug
  
Doug Anderson April 21, 2023, 7:58 p.m. UTC | #6
On Fri, Apr 21, 2023 at 11:21 AM Vijaya Krishna Nivarthi
<quic_vnivarth@quicinc.com> wrote:
>
> >> If we were to have a linked list of descriptors that we can parse and
> >> free, it would require 2 more fields
> >>
> >> this_descriptor_dma - dma address of the current descriptor
> > Isn't that exactly the same value as "data_address"? Sure,
> > "data_address" is a u32 and the DMA address is 64-bits, but elsewhere
> > in the code you already rely on the fact that the upper bits of the
> > DMA address are 0 when you do:
>
>
> No; data_address is the dma_address mapped to the xfer buffer.
>
> This is provided by spi framework and retrieved from sgl.
>
> "this_descriptor" would be the dma_address of the current cmd_descriptor.
>
> this is returned from dma_pool_alloc()
>
> this would be required for freeing.

Oh! Of course, that's right. So you are correct, you'd need to add
"this_descriptor_dma", but not the virtual address since that would be
the same as the address of the structure via the list_node_t. I guess
I won't insist on using a linked list even though it seems more
elegant to me. In the very least it should fall back to PIO if the
array isn't enough and if we need to change it later we always can.

-Doug
  
Vijaya Krishna Nivarthi April 24, 2023, 9:30 a.m. UTC | #7
Hi,

Thank you very much for the review and all the inputs...


On 4/22/2023 1:28 AM, Doug Anderson wrote:
> On Fri, Apr 21, 2023 at 11:21 AM Vijaya Krishna Nivarthi
> <quic_vnivarth@quicinc.com> wrote:
>>>> If we were to have a linked list of descriptors that we can parse and
>>>> free, it would require 2 more fields
>>>>
>>>> this_descriptor_dma - dma address of the current descriptor
>>> Isn't that exactly the same value as "data_address"? Sure,
>>> "data_address" is a u32 and the DMA address is 64-bits, but elsewhere
>>> in the code you already rely on the fact that the upper bits of the
>>> DMA address are 0 when you do:
>>
>> No; data_address is the dma_address mapped to the xfer buffer.
>>
>> This is provided by spi framework and retrieved from sgl.
>>
>> "this_descriptor" would be the dma_address of the current cmd_descriptor.
>>
>> this is returned from dma_pool_alloc()
>>
>> this would be required for freeing.
> Oh! Of course, that's right. So you are correct, you'd need to add
> "this_descriptor_dma", but not the virtual address since that would be
> the same as the address of the structure via the list_node_t. I guess
> I won't insist on using a linked list even though it seems more
> elegant to me. In the very least it should fall back to PIO if the
> array isn't enough and if we need to change it later we always can.
>
> -Doug


Retained the array, addressed all other comments and uploaded v5.

Conditional can_dma() and clearing interrupts in handle_err(), I thought 
were particularly helpful as they were potential problems later.

test script was very useful too.

Thank you.
  

Patch

diff --git a/drivers/spi/spi-qcom-qspi.c b/drivers/spi/spi-qcom-qspi.c
index fab1553..879c293 100644
--- a/drivers/spi/spi-qcom-qspi.c
+++ b/drivers/spi/spi-qcom-qspi.c
@@ -2,6 +2,8 @@ 
 // Copyright (c) 2017-2018, The Linux foundation. All rights reserved.
 
 #include <linux/clk.h>
+#include <linux/dmapool.h>
+#include <linux/dma-mapping.h>
 #include <linux/interconnect.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
@@ -62,6 +64,7 @@ 
 #define WR_FIFO_FULL		BIT(10)
 #define WR_FIFO_OVERRUN		BIT(11)
 #define TRANSACTION_DONE	BIT(16)
+#define DMA_CHAIN_DONE		BIT(31)
 #define QSPI_ERR_IRQS		(RESP_FIFO_UNDERRUN | HRESP_FROM_NOC_ERR | \
 				 WR_FIFO_OVERRUN)
 #define QSPI_ALL_IRQS		(QSPI_ERR_IRQS | RESP_FIFO_RDY | \
@@ -108,6 +111,10 @@ 
 #define RD_FIFO_RESET		0x0030
 #define RESET_FIFO		BIT(0)
 
+#define NEXT_DMA_DESC_ADDR	0x0040
+#define CURRENT_DMA_DESC_ADDR	0x0044
+#define CURRENT_MEM_ADDR	0x0048
+
 #define CUR_MEM_ADDR		0x0048
 #define HW_VERSION		0x004c
 #define RD_FIFO			0x0050
@@ -120,6 +127,17 @@  enum qspi_dir {
 	QSPI_WRITE,
 };
 
+struct qspi_cmd_desc {
+	u32 data_address;
+	u32 next_descriptor;
+	u32 direction:1;
+	u32 multi_io_mode:3;
+	u32 reserved1:4;
+	u32 fragment:1;
+	u32 reserved2:7;
+	u32 length:16;
+};
+
 struct qspi_xfer {
 	union {
 		const void *tx_buf;
@@ -137,11 +155,29 @@  enum qspi_clocks {
 	QSPI_NUM_CLKS
 };
 
+enum qspi_xfer_mode {
+	QSPI_FIFO,
+	QSPI_DMA
+};
+
+/*
+ * Number of entries in sgt returned from spi framework that-
+ * will be supported. Can be modified as required.
+ * In practice, given max_dma_len is 64KB, the number of
+ * entries is not expected to exceed 1.
+ */
+#define QSPI_MAX_SG 5
+
 struct qcom_qspi {
 	void __iomem *base;
 	struct device *dev;
 	struct clk_bulk_data *clks;
 	struct qspi_xfer xfer;
+	struct dma_pool *dma_cmd_pool;
+	dma_addr_t dma_cmd_desc[QSPI_MAX_SG];
+	void *virt_cmd_desc[QSPI_MAX_SG];
+	unsigned int n_cmd_desc;
+	enum qspi_xfer_mode xfer_mode;
 	struct icc_path *icc_path_cpu_to_qspi;
 	unsigned long last_speed;
 	/* Lock to protect data accessed by IRQs */
@@ -153,21 +189,22 @@  static u32 qspi_buswidth_to_iomode(struct qcom_qspi *ctrl,
 {
 	switch (buswidth) {
 	case 1:
-		return SDR_1BIT << MULTI_IO_MODE_SHFT;
+		return SDR_1BIT;
 	case 2:
-		return SDR_2BIT << MULTI_IO_MODE_SHFT;
+		return SDR_2BIT;
 	case 4:
-		return SDR_4BIT << MULTI_IO_MODE_SHFT;
+		return SDR_4BIT;
 	default:
 		dev_warn_once(ctrl->dev,
 				"Unexpected bus width: %u\n", buswidth);
-		return SDR_1BIT << MULTI_IO_MODE_SHFT;
+		return SDR_1BIT;
 	}
 }
 
 static void qcom_qspi_pio_xfer_cfg(struct qcom_qspi *ctrl)
 {
 	u32 pio_xfer_cfg;
+	u32 iomode;
 	const struct qspi_xfer *xfer;
 
 	xfer = &ctrl->xfer;
@@ -179,7 +216,8 @@  static void qcom_qspi_pio_xfer_cfg(struct qcom_qspi *ctrl)
 	else
 		pio_xfer_cfg |= TRANSFER_FRAGMENT;
 	pio_xfer_cfg &= ~MULTI_IO_MODE_MSK;
-	pio_xfer_cfg |= qspi_buswidth_to_iomode(ctrl, xfer->buswidth);
+	iomode = qspi_buswidth_to_iomode(ctrl, xfer->buswidth);
+	pio_xfer_cfg |= iomode << MULTI_IO_MODE_SHFT;
 
 	writel(pio_xfer_cfg, ctrl->base + PIO_XFER_CFG);
 }
@@ -223,6 +261,16 @@  static void qcom_qspi_handle_err(struct spi_master *master,
 	spin_lock_irqsave(&ctrl->lock, flags);
 	writel(0, ctrl->base + MSTR_INT_EN);
 	ctrl->xfer.rem_bytes = 0;
+
+	if (ctrl->xfer_mode == QSPI_DMA) {
+		int i;
+
+		/* free cmd descriptors */
+		for (i = 0; i < ctrl->n_cmd_desc; i++)
+			dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
+					  ctrl->dma_cmd_desc[i]);
+		ctrl->n_cmd_desc = 0;
+	}
 	spin_unlock_irqrestore(&ctrl->lock, flags);
 }
 
@@ -242,7 +290,7 @@  static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
 	}
 
 	/*
-	 * Set BW quota for CPU as driver supports FIFO mode only.
+	 * Set BW quota for CPU.
 	 * We don't have explicit peak requirement so keep it equal to avg_bw.
 	 */
 	avg_bw_cpu = Bps_to_icc(speed_hz);
@@ -258,6 +306,120 @@  static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
 	return 0;
 }
 
+#define QSPI_ALIGN_REQ	32
+
+static int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, dma_addr_t dma_ptr,
+			uint32_t n_bytes)
+{
+	struct qspi_cmd_desc *virt_cmd_desc, *prev;
+	dma_addr_t dma_cmd_desc;
+
+	/* allocate for dma cmd descriptor */
+	virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
+		GFP_KERNEL, &dma_cmd_desc);
+	if (!virt_cmd_desc) {
+		dev_err(ctrl->dev,
+			"Could not allocate for cmd_desc\n");
+		return -ENOMEM;
+	}
+	ctrl->virt_cmd_desc[ctrl->n_cmd_desc] = virt_cmd_desc;
+	ctrl->dma_cmd_desc[ctrl->n_cmd_desc] = dma_cmd_desc;
+	ctrl->n_cmd_desc++;
+
+	/* setup cmd descriptor */
+	virt_cmd_desc->data_address = dma_ptr;
+	virt_cmd_desc->next_descriptor = 0;
+	virt_cmd_desc->direction = ctrl->xfer.dir;
+	virt_cmd_desc->multi_io_mode = qspi_buswidth_to_iomode(ctrl, ctrl->xfer.buswidth);
+	virt_cmd_desc->reserved1 = 0;
+	virt_cmd_desc->fragment = ctrl->xfer.is_last ? 0 : 1;
+	virt_cmd_desc->reserved2 = 0;
+	virt_cmd_desc->length = n_bytes;
+
+	/* update previous descriptor */
+	if (ctrl->n_cmd_desc >= 2) {
+		prev = (ctrl->virt_cmd_desc)[ctrl->n_cmd_desc - 2];
+		prev->next_descriptor = dma_cmd_desc;
+		prev->fragment = 1;
+	}
+
+	return 0;
+}
+
+static int qcom_qspi_setup_dma_desc(struct qcom_qspi *ctrl,
+				struct spi_transfer *xfer)
+{
+	int ret;
+	struct sg_table *sgt;
+	unsigned int sg_total_len = 0;
+	dma_addr_t dma_ptr_sg;
+	unsigned int dma_len_sg;
+	int i;
+
+	if (ctrl->n_cmd_desc) {
+		dev_err(ctrl->dev, "Remnant dma buffers n_cmd_desc-%d\n", ctrl->n_cmd_desc);
+		return -EIO;
+	}
+
+	sgt = (ctrl->xfer.dir == QSPI_READ) ? &xfer->rx_sg : &xfer->tx_sg;
+	if (!sgt->nents || sgt->nents > QSPI_MAX_SG) {
+		dev_err(ctrl->dev, "Cannot handle %d entries in scatter list\n", sgt->nents);
+		return -EAGAIN;
+	}
+
+	for (i = 0; i < sgt->nents; i++) {
+		dma_ptr_sg = sg_dma_address(sgt->sgl + i);
+		if (!IS_ALIGNED(dma_ptr_sg, QSPI_ALIGN_REQ)) {
+			dev_err(ctrl->dev, "dma address-%pad not aligned to %d\n",
+				&dma_ptr_sg, QSPI_ALIGN_REQ);
+			return -EAGAIN;
+		}
+		sg_total_len += sg_dma_len(sgt->sgl + i);
+	}
+
+	if (sg_total_len != xfer->len) {
+		dev_err(ctrl->dev, "Data lengths mismatch\n");
+		return -EAGAIN;
+	}
+
+	for (i = 0; i < sgt->nents; i++) {
+		dma_ptr_sg = sg_dma_address(sgt->sgl + i);
+		dma_len_sg = sg_dma_len(sgt->sgl + i);
+
+		ret = qcom_qspi_alloc_desc(ctrl, dma_ptr_sg, dma_len_sg);
+		if (ret)
+			goto cleanup;
+	}
+	return 0;
+
+cleanup:
+	dev_err(ctrl->dev, "ERROR cleanup in setup_dma_desc\n");
+
+	for (i = 0; i < ctrl->n_cmd_desc; i++)
+		dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
+				  ctrl->dma_cmd_desc[i]);
+	ctrl->n_cmd_desc = 0;
+	return ret;
+}
+
+static void qcom_qspi_dma_xfer(struct qcom_qspi *ctrl)
+{
+	/* Setup new interrupts */
+	writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_EN);
+
+	/* kick off transfer */
+	writel((u32)((ctrl->dma_cmd_desc)[0]), ctrl->base + NEXT_DMA_DESC_ADDR);
+}
+
+/* Switch to DMA if transfer length exceeds this */
+#define QSPI_MAX_BYTES_FIFO 64
+
+static bool qcom_qspi_can_dma(struct spi_controller *ctlr,
+			 struct spi_device *slv, struct spi_transfer *xfer)
+{
+	return xfer->len > QSPI_MAX_BYTES_FIFO;
+}
+
 static int qcom_qspi_transfer_one(struct spi_master *master,
 				  struct spi_device *slv,
 				  struct spi_transfer *xfer)
@@ -266,6 +428,7 @@  static int qcom_qspi_transfer_one(struct spi_master *master,
 	int ret;
 	unsigned long speed_hz;
 	unsigned long flags;
+	u32 mstr_cfg;
 
 	speed_hz = slv->max_speed_hz;
 	if (xfer->speed_hz)
@@ -276,6 +439,7 @@  static int qcom_qspi_transfer_one(struct spi_master *master,
 		return ret;
 
 	spin_lock_irqsave(&ctrl->lock, flags);
+	mstr_cfg = readl(ctrl->base + MSTR_CONFIG);
 
 	/* We are half duplex, so either rx or tx will be set */
 	if (xfer->rx_buf) {
@@ -290,8 +454,37 @@  static int qcom_qspi_transfer_one(struct spi_master *master,
 	ctrl->xfer.is_last = list_is_last(&xfer->transfer_list,
 					  &master->cur_msg->transfers);
 	ctrl->xfer.rem_bytes = xfer->len;
+
+	if (xfer->rx_sg.nents || xfer->tx_sg.nents) {
+		/* do DMA transfer */
+		ctrl->xfer_mode = QSPI_DMA;
+		if (!(mstr_cfg & DMA_ENABLE)) {
+			mstr_cfg |= DMA_ENABLE;
+			writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
+		}
+
+		ret = qcom_qspi_setup_dma_desc(ctrl, xfer);
+		if (ret) {
+			if (ret == -EAGAIN) {
+				dev_err_once(ctrl->dev, "DMA failure, falling back to PIO");
+				goto do_pio;
+			}
+			spin_unlock_irqrestore(&ctrl->lock, flags);
+			return ret;
+		}
+		qcom_qspi_dma_xfer(ctrl);
+		goto end;
+	}
+
+do_pio:
+	ctrl->xfer_mode = QSPI_FIFO;
+	if (mstr_cfg & DMA_ENABLE) {
+		mstr_cfg &= ~DMA_ENABLE;
+		writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
+	}
 	qcom_qspi_pio_xfer(ctrl);
 
+end:
 	spin_unlock_irqrestore(&ctrl->lock, flags);
 
 	/* We'll call spi_finalize_current_transfer() when done */
@@ -328,6 +521,17 @@  static int qcom_qspi_prepare_message(struct spi_master *master,
 	return 0;
 }
 
+static int qcom_qspi_alloc_dma(struct qcom_qspi *ctrl)
+{
+	/* allocate for cmd descriptors pool */
+	ctrl->dma_cmd_pool = dmam_pool_create("qspi cmd desc pool",
+		ctrl->dev, sizeof(struct qspi_cmd_desc), 0, 0);
+	if (!ctrl->dma_cmd_pool)
+		return -ENOMEM;
+
+	return 0;
+}
+
 static irqreturn_t pio_read(struct qcom_qspi *ctrl)
 {
 	u32 rd_fifo_status;
@@ -426,27 +630,48 @@  static irqreturn_t qcom_qspi_irq(int irq, void *dev_id)
 	int_status = readl(ctrl->base + MSTR_INT_STATUS);
 	writel(int_status, ctrl->base + MSTR_INT_STATUS);
 
-	if (ctrl->xfer.dir == QSPI_WRITE) {
-		if (int_status & WR_FIFO_EMPTY)
-			ret = pio_write(ctrl);
-	} else {
-		if (int_status & RESP_FIFO_RDY)
-			ret = pio_read(ctrl);
-	}
-
-	if (int_status & QSPI_ERR_IRQS) {
-		if (int_status & RESP_FIFO_UNDERRUN)
-			dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
-		if (int_status & WR_FIFO_OVERRUN)
-			dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
-		if (int_status & HRESP_FROM_NOC_ERR)
-			dev_err(ctrl->dev, "IRQ error: NOC response error\n");
-		ret = IRQ_HANDLED;
-	}
-
-	if (!ctrl->xfer.rem_bytes) {
-		writel(0, ctrl->base + MSTR_INT_EN);
-		spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+	switch (ctrl->xfer_mode) {
+	case QSPI_FIFO:
+		if (ctrl->xfer.dir == QSPI_WRITE) {
+			if (int_status & WR_FIFO_EMPTY)
+				ret = pio_write(ctrl);
+		} else {
+			if (int_status & RESP_FIFO_RDY)
+				ret = pio_read(ctrl);
+		}
+
+		if (int_status & QSPI_ERR_IRQS) {
+			if (int_status & RESP_FIFO_UNDERRUN)
+				dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
+			if (int_status & WR_FIFO_OVERRUN)
+				dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
+			if (int_status & HRESP_FROM_NOC_ERR)
+				dev_err(ctrl->dev, "IRQ error: NOC response error\n");
+			ret = IRQ_HANDLED;
+		}
+
+		if (!ctrl->xfer.rem_bytes) {
+			writel(0, ctrl->base + MSTR_INT_EN);
+			spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+		}
+		break;
+	case QSPI_DMA:
+		if (int_status & DMA_CHAIN_DONE) {
+			int i;
+
+			writel(0, ctrl->base + MSTR_INT_EN);
+
+			for (i = 0; i < ctrl->n_cmd_desc; i++)
+				dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
+						  ctrl->dma_cmd_desc[i]);
+			ctrl->n_cmd_desc = 0;
+
+			ret = IRQ_HANDLED;
+			spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+		}
+		break;
+	default:
+		dev_err(ctrl->dev, "Unknown xfer mode:%d", ctrl->xfer_mode);
 	}
 
 	spin_unlock(&ctrl->lock);
@@ -517,7 +742,14 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+	if (ret)
+		return dev_err_probe(dev, ret, "could not set DMA mask\n");
+
 	master->max_speed_hz = 300000000;
+	master->max_dma_len = 65536; /* as per HPG */
+	/* intimate protocal drivers about alignment requirement */
+	master->dma_alignment = QSPI_ALIGN_REQ;
 	master->num_chipselect = QSPI_NUM_CS;
 	master->bus_num = -1;
 	master->dev.of_node = pdev->dev.of_node;
@@ -528,6 +760,7 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 	master->prepare_message = qcom_qspi_prepare_message;
 	master->transfer_one = qcom_qspi_transfer_one;
 	master->handle_err = qcom_qspi_handle_err;
+	master->can_dma = qcom_qspi_can_dma;
 	master->auto_runtime_pm = true;
 
 	ret = devm_pm_opp_set_clkname(&pdev->dev, "core");
@@ -540,6 +773,11 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	/* allocate for DMA descriptor pools */
+	ret = qcom_qspi_alloc_dma(ctrl);
+	if (ret)
+		return ret;
+
 	pm_runtime_use_autosuspend(dev);
 	pm_runtime_set_autosuspend_delay(dev, 250);
 	pm_runtime_enable(dev);