usb(4): use cacheable buffers for data transfers (massive speedup)

classic Classic list List threaded Threaded
11 messages Options
Reply | Threaded
Open this post in threaded view
|

usb(4): use cacheable buffers for data transfers (massive speedup)

Patrick Wildt-3
Hi,

I've spent a few days investigating why USB ethernet adapters are so
horribly slow on my ARMs.  Using dt(4) I realized that it was spending
most of its time in memcpy.  But, why?  As it turns out, all USB data
buffers are mapped COHERENT, which on some/most ARMs means uncached.
Using cached data buffers makes the performance rise from 20 mbit/s to
200 mbit/s.  Quite a difference.

sys/dev/usb/usb_mem.c:
        error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
                               &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);

On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
Some SoCs have cache-coherent USB controllers, some don't.  Mine does
not, so mapping it COHERENT means uncached and thus slow.

Why do we do that?  Well, when the code was imported in 99, it was
already there.  Since then we have gained infrastructure for DMA
syncs in the USB stack, which I think are proper.

sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)

        if (!usbd_xfer_isread(xfer)) {
                if ((xfer->flags & USBD_NO_COPY) == 0)
                        memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
                            xfer->length);
                usb_syncmem(&xfer->dmabuf, 0, xfer->length,
                    BUS_DMASYNC_PREWRITE);
        } else
                usb_syncmem(&xfer->dmabuf, 0, xfer->length,
                    BUS_DMASYNC_PREREAD);
        err = pipe->methods->transfer(xfer);

sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)

        if (xfer->actlen != 0) {
                if (usbd_xfer_isread(xfer)) {
                        usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
                            BUS_DMASYNC_POSTREAD);
                        if (!(xfer->flags & USBD_NO_COPY))
                                memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
                                    xfer->actlen);
                } else
                        usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
                            BUS_DMASYNC_POSTWRITE);
        }

We cannot just remove COHERENT, since some drivers, like ehci(4), use
the same backend to allocate their rings.  And I can't vouch for those
drivers' sanity.

As a first step, I would like to go ahead with another solution, which
is based on a diff from Marius Strobl, who added those syncs in the
first place.  Essentially it splits the memory handling into cacheable
and non-cacheable blocks.  The USB data transfers and everyone who uses
usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
still don't.  This is a bit of a safer approach imho, since we don't
hurt the controller drivers, but speed up the data buffers.

Once we have verified that there are no regressions, we can adjust
ehci(4) and the like, add proper syncs, make sure they still work as
well as before, and maybe then back this out again.

Keep note that this is all a no-op on X86, but all the other archs will
profit from this.

ok?

Patrick

diff --git a/sys/dev/usb/usb_mem.c b/sys/dev/usb/usb_mem.c
index c65906b43f4..95993093b5a 100644
--- a/sys/dev/usb/usb_mem.c
+++ b/sys/dev/usb/usb_mem.c
@@ -72,7 +72,7 @@ struct usb_frag_dma {
 };
 
 usbd_status usb_block_allocmem(bus_dma_tag_t, size_t, size_t,
-    struct usb_dma_block **);
+    struct usb_dma_block **, int);
 void usb_block_freemem(struct usb_dma_block *);
 
 LIST_HEAD(, usb_dma_block) usb_blk_freelist =
@@ -84,7 +84,7 @@ LIST_HEAD(, usb_frag_dma) usb_frag_freelist =
 
 usbd_status
 usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
-    struct usb_dma_block **dmap)
+    struct usb_dma_block **dmap, int cacheable)
 {
  int error;
         struct usb_dma_block *p;
@@ -96,7 +96,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
  s = splusb();
  /* First check the free list. */
  for (p = LIST_FIRST(&usb_blk_freelist); p; p = LIST_NEXT(p, next)) {
- if (p->tag == tag && p->size >= size && p->align >= align) {
+ if (p->tag == tag && p->size >= size && p->align >= align &&
+    p->cacheable == cacheable) {
  LIST_REMOVE(p, next);
  usb_blk_nfree--;
  splx(s);
@@ -116,6 +117,7 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
  p->tag = tag;
  p->size = size;
  p->align = align;
+ p->cacheable = cacheable;
  error = bus_dmamem_alloc(tag, p->size, align, 0,
  p->segs, nitems(p->segs),
  &p->nsegs, BUS_DMA_NOWAIT);
@@ -123,7 +125,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
  goto free0;
 
  error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
-       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
+       &p->kaddr, BUS_DMA_NOWAIT | (cacheable ? 0 :
+       BUS_DMA_COHERENT));
  if (error)
  goto free1;
 
@@ -187,14 +190,18 @@ usb_allocmem(struct usbd_bus *bus, size_t size, size_t align, struct usb_dma *p)
  usbd_status err;
  struct usb_frag_dma *f;
  struct usb_dma_block *b;
+ int cacheable;
  int i;
  int s;
 
+ cacheable = !!(p->flags & USB_DMA_XFER_BUFFER);
+
  /* If the request is large then just use a full block. */
  if (size > USB_MEM_SMALL || align > USB_MEM_SMALL) {
  DPRINTFN(1, ("%s: large alloc %d\n", __func__, (int)size));
  size = (size + USB_MEM_BLOCK - 1) & ~(USB_MEM_BLOCK - 1);
- err = usb_block_allocmem(tag, size, align, &p->block);
+ err = usb_block_allocmem(tag, size, align, &p->block,
+    cacheable);
  if (!err) {
  p->block->frags = NULL;
  p->offs = 0;
@@ -205,11 +212,12 @@ usb_allocmem(struct usbd_bus *bus, size_t size, size_t align, struct usb_dma *p)
  s = splusb();
  /* Check for free fragments. */
  for (f = LIST_FIRST(&usb_frag_freelist); f; f = LIST_NEXT(f, next))
- if (f->block->tag == tag)
+ if (f->block->tag == tag && f->block->cacheable == cacheable)
  break;
  if (f == NULL) {
  DPRINTFN(1, ("usb_allocmem: adding fragments\n"));
- err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL,&b);
+ err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL, &b,
+    cacheable);
  if (err) {
  splx(s);
  return (err);
diff --git a/sys/dev/usb/usb_mem.h b/sys/dev/usb/usb_mem.h
index 1ae933f4bd9..f4d0bb85bbd 100644
--- a/sys/dev/usb/usb_mem.h
+++ b/sys/dev/usb/usb_mem.h
@@ -40,6 +40,7 @@ struct usb_dma_block {
         caddr_t kaddr;
         bus_dma_segment_t segs[1];
         int nsegs;
+ int cacheable;
         size_t size;
         size_t align;
  struct usb_frag_dma *frags;
diff --git a/sys/dev/usb/usbdi.c b/sys/dev/usb/usbdi.c
index aba1220103f..5f750a2bbcb 100644
--- a/sys/dev/usb/usbdi.c
+++ b/sys/dev/usb/usbdi.c
@@ -305,6 +305,7 @@ usbd_transfer(struct usbd_xfer *xfer)
  if (xfer->rqflags & URQ_AUTO_DMABUF)
  printf("usbd_transfer: has old buffer!\n");
 #endif
+ xfer->dmabuf.flags |= USB_DMA_XFER_BUFFER;
  err = usb_allocmem(bus, xfer->length, 0, &xfer->dmabuf);
  if (err)
  return (err);
@@ -389,6 +390,7 @@ usbd_alloc_buffer(struct usbd_xfer *xfer, u_int32_t size)
  if (xfer->rqflags & (URQ_DEV_DMABUF | URQ_AUTO_DMABUF))
  printf("usbd_alloc_buffer: xfer already has a buffer\n");
 #endif
+ xfer->dmabuf.flags |= USB_DMA_XFER_BUFFER;
  err = usb_allocmem(bus, size, 0, &xfer->dmabuf);
  if (err)
  return (NULL);
diff --git a/sys/dev/usb/usbdivar.h b/sys/dev/usb/usbdivar.h
index 6600c402979..11a6d52a0f9 100644
--- a/sys/dev/usb/usbdivar.h
+++ b/sys/dev/usb/usbdivar.h
@@ -47,6 +47,8 @@ struct usb_dma_block;
 struct usb_dma {
  struct usb_dma_block *block;
  u_int offs;
+ int flags;
+#define USB_DMA_XFER_BUFFER 0x01
 };
 
 struct usbd_xfer;

Reply | Threaded
Open this post in threaded view
|

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

Patrick Wildt-3
On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:

> Hi,
>
> I've spent a few days investigating why USB ethernet adapters are so
> horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> most of its time in memcpy.  But, why?  As it turns out, all USB data
> buffers are mapped COHERENT, which on some/most ARMs means uncached.
> Using cached data buffers makes the performance rise from 20 mbit/s to
> 200 mbit/s.  Quite a difference.
>
> sys/dev/usb/usb_mem.c:
> error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
>       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
>
> On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> not, so mapping it COHERENT means uncached and thus slow.
>
> Why do we do that?  Well, when the code was imported in 99, it was
> already there.  Since then we have gained infrastructure for DMA
> syncs in the USB stack, which I think are proper.
>
> sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
>
> if (!usbd_xfer_isread(xfer)) {
> if ((xfer->flags & USBD_NO_COPY) == 0)
> memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
>    xfer->length);
> usb_syncmem(&xfer->dmabuf, 0, xfer->length,
>    BUS_DMASYNC_PREWRITE);
> } else
> usb_syncmem(&xfer->dmabuf, 0, xfer->length,
>    BUS_DMASYNC_PREREAD);
> err = pipe->methods->transfer(xfer);
>
> sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
>
> if (xfer->actlen != 0) {
> if (usbd_xfer_isread(xfer)) {
> usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
>    BUS_DMASYNC_POSTREAD);
> if (!(xfer->flags & USBD_NO_COPY))
> memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
>    xfer->actlen);
> } else
> usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
>    BUS_DMASYNC_POSTWRITE);
> }
>
> We cannot just remove COHERENT, since some drivers, like ehci(4), use
> the same backend to allocate their rings.  And I can't vouch for those
> drivers' sanity.
>
> As a first step, I would like to go ahead with another solution, which
> is based on a diff from Marius Strobl, who added those syncs in the
> first place.  Essentially it splits the memory handling into cacheable
> and non-cacheable blocks.  The USB data transfers and everyone who uses
> usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> still don't.  This is a bit of a safer approach imho, since we don't
> hurt the controller drivers, but speed up the data buffers.
>
> Once we have verified that there are no regressions, we can adjust
> ehci(4) and the like, add proper syncs, make sure they still work as
> well as before, and maybe then back this out again.
>
> Keep note that this is all a no-op on X86, but all the other archs will
> profit from this.
>
> ok?
>
> Patrick

Update diff with inverted logic.  kettenis@ argues that we should
invert the logic, and those who need COHERENT memory should ask
for that explicitly, since for bus_dmamem_map() it also needs to
be passed explicitly.  This also points out all those users that
use usb_allocmem() internally, where we might want to have a look
if COHERENT is actually needed or not, or if it can be refactored
in another way.

diff --git a/sys/dev/usb/dwc2/dwc2.c b/sys/dev/usb/dwc2/dwc2.c
index 6f035467213..099dfa26da1 100644
--- a/sys/dev/usb/dwc2/dwc2.c
+++ b/sys/dev/usb/dwc2/dwc2.c
@@ -473,6 +473,7 @@ dwc2_open(struct usbd_pipe *pipe)
  switch (xfertype) {
  case UE_CONTROL:
  pipe->methods = &dwc2_device_ctrl_methods;
+ dpipe->req_dma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, sizeof(usb_device_request_t),
     0, &dpipe->req_dma);
  if (err)
diff --git a/sys/dev/usb/dwc2/dwc2_hcd.c b/sys/dev/usb/dwc2/dwc2_hcd.c
index 7e5c91481d5..d44e3196e61 100644
--- a/sys/dev/usb/dwc2/dwc2_hcd.c
+++ b/sys/dev/usb/dwc2/dwc2_hcd.c
@@ -679,6 +679,7 @@ STATIC int dwc2_hc_setup_align_buf(struct dwc2_hsotg *hsotg, struct dwc2_qh *qh,
 
  qh->dw_align_buf = NULL;
  qh->dw_align_buf_dma = 0;
+ qh->dw_align_buf_usbdma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&hsotg->hsotg_sc->sc_bus, buf_size, buf_size,
    &qh->dw_align_buf_usbdma);
  if (!err) {
@@ -2267,6 +2268,7 @@ int dwc2_hcd_init(struct dwc2_hsotg *hsotg,
  */
  hsotg->status_buf = NULL;
  if (hsotg->core_params->dma_enable > 0) {
+ hsotg->status_buf_usbdma.flags |= USB_DMA_COHERENT;
  retval = usb_allocmem(&hsotg->hsotg_sc->sc_bus,
       DWC2_HCD_STATUS_BUF_SIZE, 0,
       &hsotg->status_buf_usbdma);
diff --git a/sys/dev/usb/dwc2/dwc2_hcdddma.c b/sys/dev/usb/dwc2/dwc2_hcdddma.c
index d8584eed50d..72da68e5667 100644
--- a/sys/dev/usb/dwc2/dwc2_hcdddma.c
+++ b/sys/dev/usb/dwc2/dwc2_hcdddma.c
@@ -102,6 +102,7 @@ STATIC int dwc2_desc_list_alloc(struct dwc2_hsotg *hsotg, struct dwc2_qh *qh,
  //KASSERT(!cpu_intr_p() && !cpu_softintr_p());
 
  qh->desc_list = NULL;
+ qh->desc_list_usbdma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&hsotg->hsotg_sc->sc_bus,
     sizeof(struct dwc2_hcd_dma_desc) * dwc2_max_desc_num(qh), 0,
     &qh->desc_list_usbdma);
@@ -143,6 +144,7 @@ STATIC int dwc2_frame_list_alloc(struct dwc2_hsotg *hsotg, gfp_t mem_flags)
 
  /* XXXNH - struct pool */
  hsotg->frame_list = NULL;
+ hsotg->frame_list_usbdma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&hsotg->hsotg_sc->sc_bus, 4 * FRLISTEN_64_SIZE,
     0, &hsotg->frame_list_usbdma);
 
diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
index ddde2796409..1bb6698cbcc 100644
--- a/sys/dev/usb/ehci.c
+++ b/sys/dev/usb/ehci.c
@@ -355,6 +355,7 @@ ehci_init(struct ehci_softc *sc)
  case 3:
  return (USBD_IOERROR);
  }
+ sc->sc_fldma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, sc->sc_flsize * sizeof(ehci_link_t),
     EHCI_FLALIGN_ALIGN, &sc->sc_fldma);
  if (err)
@@ -1469,6 +1470,7 @@ ehci_open(struct usbd_pipe *pipe)
 
  switch (xfertype) {
  case UE_CONTROL:
+ epipe->u.ctl.reqdma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, sizeof(usb_device_request_t),
     0, &epipe->u.ctl.reqdma);
  if (err) {
@@ -2257,6 +2259,7 @@ ehci_alloc_sqh(struct ehci_softc *sc)
  s = splusb();
  if (sc->sc_freeqhs == NULL) {
  DPRINTFN(2, ("ehci_alloc_sqh: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, EHCI_SQH_SIZE * EHCI_SQH_CHUNK,
     EHCI_PAGE_SIZE, &dma);
  if (err)
@@ -2305,6 +2308,7 @@ ehci_alloc_sqtd(struct ehci_softc *sc)
  s = splusb();
  if (sc->sc_freeqtds == NULL) {
  DPRINTFN(2, ("ehci_alloc_sqtd: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, EHCI_SQTD_SIZE*EHCI_SQTD_CHUNK,
     EHCI_PAGE_SIZE, &dma);
  if (err)
@@ -2533,6 +2537,7 @@ ehci_alloc_itd(struct ehci_softc *sc)
  }
 
  if (freeitd == NULL) {
+ dma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, EHCI_ITD_SIZE * EHCI_ITD_CHUNK,
     EHCI_PAGE_SIZE, &dma);
  if (err) {
diff --git a/sys/dev/usb/ohci.c b/sys/dev/usb/ohci.c
index ad3a9811a9c..022286380dc 100644
--- a/sys/dev/usb/ohci.c
+++ b/sys/dev/usb/ohci.c
@@ -394,6 +394,7 @@ ohci_alloc_sed(struct ohci_softc *sc)
  s = splusb();
  if (sc->sc_freeeds == NULL) {
  DPRINTFN(2, ("ohci_alloc_sed: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, OHCI_SED_SIZE * OHCI_SED_CHUNK,
   OHCI_ED_ALIGN, &dma);
  if (err)
@@ -439,6 +440,7 @@ ohci_alloc_std(struct ohci_softc *sc)
  s = splusb();
  if (sc->sc_freetds == NULL) {
  DPRINTFN(2, ("ohci_alloc_std: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, OHCI_STD_SIZE * OHCI_STD_CHUNK,
   OHCI_TD_ALIGN, &dma);
  if (err)
@@ -597,6 +599,7 @@ ohci_alloc_sitd(struct ohci_softc *sc)
 
  if (sc->sc_freeitds == NULL) {
  DPRINTFN(2, ("ohci_alloc_sitd: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, OHCI_SITD_SIZE * OHCI_SITD_CHUNK,
   OHCI_ITD_ALIGN, &dma);
  if (err)
@@ -728,6 +731,7 @@ ohci_init(struct ohci_softc *sc)
 
  /* XXX determine alignment by R/W */
  /* Allocate the HCCA area. */
+ sc->sc_hccadma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, OHCI_HCCA_SIZE,
  OHCI_HCCA_ALIGN, &sc->sc_hccadma);
  if (err)
@@ -1931,6 +1935,7 @@ ohci_open(struct usbd_pipe *pipe)
  switch (xfertype) {
  case UE_CONTROL:
  pipe->methods = &ohci_device_ctrl_methods;
+ opipe->u.ctl.reqdma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus,
   sizeof(usb_device_request_t),
   0, &opipe->u.ctl.reqdma);
diff --git a/sys/dev/usb/uhci.c b/sys/dev/usb/uhci.c
index 47e6d1678c0..c3e0e15856a 100644
--- a/sys/dev/usb/uhci.c
+++ b/sys/dev/usb/uhci.c
@@ -377,6 +377,7 @@ uhci_init(struct uhci_softc *sc)
  UWRITE1(sc, UHCI_SOF, sc->sc_saved_sof);
 
  /* Allocate and initialize real frame array. */
+ sc->sc_dma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus,
   UHCI_FRAMELIST_COUNT * sizeof(uhci_physaddr_t),
   UHCI_FRAMELIST_ALIGN, &sc->sc_dma);
@@ -1414,6 +1415,7 @@ uhci_alloc_std(struct uhci_softc *sc)
  s = splusb();
  if (sc->sc_freetds == NULL) {
  DPRINTFN(2,("uhci_alloc_std: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, UHCI_STD_SIZE * UHCI_STD_CHUNK,
   UHCI_TD_ALIGN, &dma);
  if (err)
@@ -1468,6 +1470,7 @@ uhci_alloc_sqh(struct uhci_softc *sc)
  s = splusb();
  if (sc->sc_freeqhs == NULL) {
  DPRINTFN(2, ("uhci_alloc_sqh: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus, UHCI_SQH_SIZE * UHCI_SQH_CHUNK,
   UHCI_QH_ALIGN, &dma);
  if (err)
@@ -2650,6 +2653,7 @@ uhci_open(struct usbd_pipe *pipe)
  uhci_free_std(sc, upipe->u.ctl.setup);
  goto bad;
  }
+ upipe->u.ctl.reqdma.flags |= USB_DMA_COHERENT;
  err = usb_allocmem(&sc->sc_bus,
   sizeof(usb_device_request_t),
   0, &upipe->u.ctl.reqdma);
diff --git a/sys/dev/usb/usb_mem.c b/sys/dev/usb/usb_mem.c
index c65906b43f4..a2b8a3c10be 100644
--- a/sys/dev/usb/usb_mem.c
+++ b/sys/dev/usb/usb_mem.c
@@ -72,7 +72,7 @@ struct usb_frag_dma {
 };
 
 usbd_status usb_block_allocmem(bus_dma_tag_t, size_t, size_t,
-    struct usb_dma_block **);
+    struct usb_dma_block **, int);
 void usb_block_freemem(struct usb_dma_block *);
 
 LIST_HEAD(, usb_dma_block) usb_blk_freelist =
@@ -84,7 +84,7 @@ LIST_HEAD(, usb_frag_dma) usb_frag_freelist =
 
 usbd_status
 usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
-    struct usb_dma_block **dmap)
+    struct usb_dma_block **dmap, int coherent)
 {
  int error;
         struct usb_dma_block *p;
@@ -96,7 +96,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
  s = splusb();
  /* First check the free list. */
  for (p = LIST_FIRST(&usb_blk_freelist); p; p = LIST_NEXT(p, next)) {
- if (p->tag == tag && p->size >= size && p->align >= align) {
+ if (p->tag == tag && p->size >= size && p->align >= align &&
+    p->coherent == coherent) {
  LIST_REMOVE(p, next);
  usb_blk_nfree--;
  splx(s);
@@ -116,6 +117,7 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
  p->tag = tag;
  p->size = size;
  p->align = align;
+ p->coherent = coherent;
  error = bus_dmamem_alloc(tag, p->size, align, 0,
  p->segs, nitems(p->segs),
  &p->nsegs, BUS_DMA_NOWAIT);
@@ -123,7 +125,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
  goto free0;
 
  error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
-       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
+       &p->kaddr, BUS_DMA_NOWAIT | (coherent ?
+       BUS_DMA_COHERENT : 0));
  if (error)
  goto free1;
 
@@ -187,14 +190,18 @@ usb_allocmem(struct usbd_bus *bus, size_t size, size_t align, struct usb_dma *p)
  usbd_status err;
  struct usb_frag_dma *f;
  struct usb_dma_block *b;
+ int coherent;
  int i;
  int s;
 
+ coherent = !!(p->flags & USB_DMA_COHERENT);
+
  /* If the request is large then just use a full block. */
  if (size > USB_MEM_SMALL || align > USB_MEM_SMALL) {
  DPRINTFN(1, ("%s: large alloc %d\n", __func__, (int)size));
  size = (size + USB_MEM_BLOCK - 1) & ~(USB_MEM_BLOCK - 1);
- err = usb_block_allocmem(tag, size, align, &p->block);
+ err = usb_block_allocmem(tag, size, align, &p->block,
+    coherent);
  if (!err) {
  p->block->frags = NULL;
  p->offs = 0;
@@ -205,11 +212,12 @@ usb_allocmem(struct usbd_bus *bus, size_t size, size_t align, struct usb_dma *p)
  s = splusb();
  /* Check for free fragments. */
  for (f = LIST_FIRST(&usb_frag_freelist); f; f = LIST_NEXT(f, next))
- if (f->block->tag == tag)
+ if (f->block->tag == tag && f->block->coherent == coherent)
  break;
  if (f == NULL) {
  DPRINTFN(1, ("usb_allocmem: adding fragments\n"));
- err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL,&b);
+ err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL, &b,
+    coherent);
  if (err) {
  splx(s);
  return (err);
diff --git a/sys/dev/usb/usb_mem.h b/sys/dev/usb/usb_mem.h
index 1ae933f4bd9..7d490dd4833 100644
--- a/sys/dev/usb/usb_mem.h
+++ b/sys/dev/usb/usb_mem.h
@@ -40,6 +40,7 @@ struct usb_dma_block {
         caddr_t kaddr;
         bus_dma_segment_t segs[1];
         int nsegs;
+ int coherent;
         size_t size;
         size_t align;
  struct usb_frag_dma *frags;
diff --git a/sys/dev/usb/usbdivar.h b/sys/dev/usb/usbdivar.h
index 6600c402979..102ec2f1af4 100644
--- a/sys/dev/usb/usbdivar.h
+++ b/sys/dev/usb/usbdivar.h
@@ -47,6 +47,8 @@ struct usb_dma_block;
 struct usb_dma {
  struct usb_dma_block *block;
  u_int offs;
+ int flags;
+#define USB_DMA_COHERENT (1 << 0)
 };
 
 struct usbd_xfer;

Reply | Threaded
Open this post in threaded view
|

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

Jonathan Gray-11
On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:

> On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > Hi,
> >
> > I've spent a few days investigating why USB ethernet adapters are so
> > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > Using cached data buffers makes the performance rise from 20 mbit/s to
> > 200 mbit/s.  Quite a difference.
> >
> > sys/dev/usb/usb_mem.c:
> > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> >       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> >
> > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > not, so mapping it COHERENT means uncached and thus slow.
> >
> > Why do we do that?  Well, when the code was imported in 99, it was
> > already there.  Since then we have gained infrastructure for DMA
> > syncs in the USB stack, which I think are proper.
> >
> > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> >
> > if (!usbd_xfer_isread(xfer)) {
> > if ((xfer->flags & USBD_NO_COPY) == 0)
> > memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
> >    xfer->length);
> > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> >    BUS_DMASYNC_PREWRITE);
> > } else
> > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> >    BUS_DMASYNC_PREREAD);
> > err = pipe->methods->transfer(xfer);
> >
> > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> >
> > if (xfer->actlen != 0) {
> > if (usbd_xfer_isread(xfer)) {
> > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> >    BUS_DMASYNC_POSTREAD);
> > if (!(xfer->flags & USBD_NO_COPY))
> > memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
> >    xfer->actlen);
> > } else
> > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> >    BUS_DMASYNC_POSTWRITE);
> > }
> >
> > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > the same backend to allocate their rings.  And I can't vouch for those
> > drivers' sanity.
> >
> > As a first step, I would like to go ahead with another solution, which
> > is based on a diff from Marius Strobl, who added those syncs in the
> > first place.  Essentially it splits the memory handling into cacheable
> > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > still don't.  This is a bit of a safer approach imho, since we don't
> > hurt the controller drivers, but speed up the data buffers.
> >
> > Once we have verified that there are no regressions, we can adjust
> > ehci(4) and the like, add proper syncs, make sure they still work as
> > well as before, and maybe then back this out again.
> >
> > Keep note that this is all a no-op on X86, but all the other archs will
> > profit from this.
> >
> > ok?
> >
> > Patrick
>
> Update diff with inverted logic.  kettenis@ argues that we should
> invert the logic, and those who need COHERENT memory should ask
> for that explicitly, since for bus_dmamem_map() it also needs to
> be passed explicitly.  This also points out all those users that
> use usb_allocmem() internally, where we might want to have a look
> if COHERENT is actually needed or not, or if it can be refactored
> in another way.

These commits broke usb on imx.6 with cubox:

imxehci0 at simplebus3
usb0 at imxehci0: USB revision 2.0
usb0: root hub problem
imxehci1 at simplebus3
usb1 at imxehci1: USB revision 2.0
usb1: root hub problem
"usbmisc" at simplebus3 not configured

After reverting them I can use filesystems on usb again.

diff --git sys/dev/usb/dwc2/dwc2.c sys/dev/usb/dwc2/dwc2.c
index 6ca3cc658e5..6f035467213 100644
--- sys/dev/usb/dwc2/dwc2.c
+++ sys/dev/usb/dwc2/dwc2.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: dwc2.c,v 1.51 2020/03/21 12:08:31 patrick Exp $ */
+/* $OpenBSD: dwc2.c,v 1.49 2019/11/27 11:16:59 mpi Exp $ */
 /* $NetBSD: dwc2.c,v 1.32 2014/09/02 23:26:20 macallan Exp $ */
 
 /*-
@@ -474,7 +474,7 @@ dwc2_open(struct usbd_pipe *pipe)
  case UE_CONTROL:
  pipe->methods = &dwc2_device_ctrl_methods;
  err = usb_allocmem(&sc->sc_bus, sizeof(usb_device_request_t),
-    0, USB_DMA_COHERENT, &dpipe->req_dma);
+    0, &dpipe->req_dma);
  if (err)
  return err;
  break;
diff --git sys/dev/usb/dwc2/dwc2_hcd.c sys/dev/usb/dwc2/dwc2_hcd.c
index ab76de7d766..7e5c91481d5 100644
--- sys/dev/usb/dwc2/dwc2_hcd.c
+++ sys/dev/usb/dwc2/dwc2_hcd.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: dwc2_hcd.c,v 1.22 2020/03/21 12:08:31 patrick Exp $ */
+/* $OpenBSD: dwc2_hcd.c,v 1.20 2017/09/08 05:36:53 deraadt Exp $ */
 /* $NetBSD: dwc2_hcd.c,v 1.15 2014/11/24 10:14:14 skrll Exp $ */
 
 /*
@@ -680,7 +680,7 @@ STATIC int dwc2_hc_setup_align_buf(struct dwc2_hsotg *hsotg, struct dwc2_qh *qh,
  qh->dw_align_buf = NULL;
  qh->dw_align_buf_dma = 0;
  err = usb_allocmem(&hsotg->hsotg_sc->sc_bus, buf_size, buf_size,
-   USB_DMA_COHERENT, &qh->dw_align_buf_usbdma);
+   &qh->dw_align_buf_usbdma);
  if (!err) {
  struct usb_dma *ud = &qh->dw_align_buf_usbdma;
 
@@ -2269,7 +2269,6 @@ int dwc2_hcd_init(struct dwc2_hsotg *hsotg,
  if (hsotg->core_params->dma_enable > 0) {
  retval = usb_allocmem(&hsotg->hsotg_sc->sc_bus,
       DWC2_HCD_STATUS_BUF_SIZE, 0,
-      USB_DMA_COHERENT,
       &hsotg->status_buf_usbdma);
  if (!retval) {
  hsotg->status_buf = KERNADDR(&hsotg->status_buf_usbdma, 0);
diff --git sys/dev/usb/dwc2/dwc2_hcdddma.c sys/dev/usb/dwc2/dwc2_hcdddma.c
index 4355fd2d30a..d8584eed50d 100644
--- sys/dev/usb/dwc2/dwc2_hcdddma.c
+++ sys/dev/usb/dwc2/dwc2_hcdddma.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: dwc2_hcdddma.c,v 1.16 2020/03/21 12:08:31 patrick Exp $ */
+/* $OpenBSD: dwc2_hcdddma.c,v 1.14 2017/09/08 05:36:53 deraadt Exp $ */
 /* $NetBSD: dwc2_hcdddma.c,v 1.6 2014/04/03 06:34:58 skrll Exp $ */
 
 /*
@@ -104,7 +104,7 @@ STATIC int dwc2_desc_list_alloc(struct dwc2_hsotg *hsotg, struct dwc2_qh *qh,
  qh->desc_list = NULL;
  err = usb_allocmem(&hsotg->hsotg_sc->sc_bus,
     sizeof(struct dwc2_hcd_dma_desc) * dwc2_max_desc_num(qh), 0,
-    USB_DMA_COHERENT, &qh->desc_list_usbdma);
+    &qh->desc_list_usbdma);
 
  if (!err) {
  qh->desc_list = KERNADDR(&qh->desc_list_usbdma, 0);
@@ -144,7 +144,7 @@ STATIC int dwc2_frame_list_alloc(struct dwc2_hsotg *hsotg, gfp_t mem_flags)
  /* XXXNH - struct pool */
  hsotg->frame_list = NULL;
  err = usb_allocmem(&hsotg->hsotg_sc->sc_bus, 4 * FRLISTEN_64_SIZE,
-    0, USB_DMA_COHERENT, &hsotg->frame_list_usbdma);
+    0, &hsotg->frame_list_usbdma);
 
  if (!err) {
  hsotg->frame_list = KERNADDR(&hsotg->frame_list_usbdma, 0);
diff --git sys/dev/usb/ehci.c sys/dev/usb/ehci.c
index a352d83eaf4..4bfea89233a 100644
--- sys/dev/usb/ehci.c
+++ sys/dev/usb/ehci.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ehci.c,v 1.209 2020/03/30 22:29:04 krw Exp $ */
+/* $OpenBSD: ehci.c,v 1.206 2020/02/22 14:01:34 jasper Exp $ */
 /* $NetBSD: ehci.c,v 1.66 2004/06/30 03:11:56 mycroft Exp $ */
 
 /*
@@ -356,7 +356,7 @@ ehci_init(struct ehci_softc *sc)
  return (USBD_IOERROR);
  }
  err = usb_allocmem(&sc->sc_bus, sc->sc_flsize * sizeof(ehci_link_t),
-    EHCI_FLALIGN_ALIGN, USB_DMA_COHERENT, &sc->sc_fldma);
+    EHCI_FLALIGN_ALIGN, &sc->sc_fldma);
  if (err)
  return (err);
  DPRINTF(("%s: flsize=%d\n", sc->sc_bus.bdev.dv_xname,sc->sc_flsize));
@@ -1470,7 +1470,7 @@ ehci_open(struct usbd_pipe *pipe)
  switch (xfertype) {
  case UE_CONTROL:
  err = usb_allocmem(&sc->sc_bus, sizeof(usb_device_request_t),
-    0, USB_DMA_COHERENT, &epipe->u.ctl.reqdma);
+    0, &epipe->u.ctl.reqdma);
  if (err) {
  ehci_free_sqh(sc, sqh);
  return (err);
@@ -2258,7 +2258,7 @@ ehci_alloc_sqh(struct ehci_softc *sc)
  if (sc->sc_freeqhs == NULL) {
  DPRINTFN(2, ("ehci_alloc_sqh: allocating chunk\n"));
  err = usb_allocmem(&sc->sc_bus, EHCI_SQH_SIZE * EHCI_SQH_CHUNK,
-    EHCI_PAGE_SIZE, USB_DMA_COHERENT, &dma);
+    EHCI_PAGE_SIZE, &dma);
  if (err)
  goto out;
  for (i = 0; i < EHCI_SQH_CHUNK; i++) {
@@ -2306,7 +2306,7 @@ ehci_alloc_sqtd(struct ehci_softc *sc)
  if (sc->sc_freeqtds == NULL) {
  DPRINTFN(2, ("ehci_alloc_sqtd: allocating chunk\n"));
  err = usb_allocmem(&sc->sc_bus, EHCI_SQTD_SIZE*EHCI_SQTD_CHUNK,
-    EHCI_PAGE_SIZE, USB_DMA_COHERENT, &dma);
+    EHCI_PAGE_SIZE, &dma);
  if (err)
  goto out;
  for(i = 0; i < EHCI_SQTD_CHUNK; i++) {
@@ -2532,7 +2532,7 @@ ehci_alloc_itd(struct ehci_softc *sc)
 
  if (freeitd == NULL) {
  err = usb_allocmem(&sc->sc_bus, EHCI_ITD_SIZE * EHCI_ITD_CHUNK,
-    EHCI_PAGE_SIZE, USB_DMA_COHERENT, &dma);
+    EHCI_PAGE_SIZE, &dma);
  if (err) {
  splx(s);
  return (NULL);
diff --git sys/dev/usb/ohci.c sys/dev/usb/ohci.c
index 7aa2303eabd..ad3a9811a9c 100644
--- sys/dev/usb/ohci.c
+++ sys/dev/usb/ohci.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ohci.c,v 1.160 2020/03/21 12:08:31 patrick Exp $ */
+/* $OpenBSD: ohci.c,v 1.158 2020/02/22 14:01:34 jasper Exp $ */
 /* $NetBSD: ohci.c,v 1.139 2003/02/22 05:24:16 tsutsui Exp $ */
 /* $FreeBSD: src/sys/dev/usb/ohci.c,v 1.22 1999/11/17 22:33:40 n_hibma Exp $ */
 
@@ -395,7 +395,7 @@ ohci_alloc_sed(struct ohci_softc *sc)
  if (sc->sc_freeeds == NULL) {
  DPRINTFN(2, ("ohci_alloc_sed: allocating chunk\n"));
  err = usb_allocmem(&sc->sc_bus, OHCI_SED_SIZE * OHCI_SED_CHUNK,
-  OHCI_ED_ALIGN, USB_DMA_COHERENT, &dma);
+  OHCI_ED_ALIGN, &dma);
  if (err)
  goto out;
  for (i = 0; i < OHCI_SED_CHUNK; i++) {
@@ -440,7 +440,7 @@ ohci_alloc_std(struct ohci_softc *sc)
  if (sc->sc_freetds == NULL) {
  DPRINTFN(2, ("ohci_alloc_std: allocating chunk\n"));
  err = usb_allocmem(&sc->sc_bus, OHCI_STD_SIZE * OHCI_STD_CHUNK,
-  OHCI_TD_ALIGN, USB_DMA_COHERENT, &dma);
+  OHCI_TD_ALIGN, &dma);
  if (err)
  goto out;
  for (i = 0; i < OHCI_STD_CHUNK; i++) {
@@ -598,7 +598,7 @@ ohci_alloc_sitd(struct ohci_softc *sc)
  if (sc->sc_freeitds == NULL) {
  DPRINTFN(2, ("ohci_alloc_sitd: allocating chunk\n"));
  err = usb_allocmem(&sc->sc_bus, OHCI_SITD_SIZE * OHCI_SITD_CHUNK,
-  OHCI_ITD_ALIGN, USB_DMA_COHERENT, &dma);
+  OHCI_ITD_ALIGN, &dma);
  if (err)
  return (NULL);
  s = splusb();
@@ -728,8 +728,8 @@ ohci_init(struct ohci_softc *sc)
 
  /* XXX determine alignment by R/W */
  /* Allocate the HCCA area. */
- err = usb_allocmem(&sc->sc_bus, OHCI_HCCA_SIZE, OHCI_HCCA_ALIGN,
-    USB_DMA_COHERENT, &sc->sc_hccadma);
+ err = usb_allocmem(&sc->sc_bus, OHCI_HCCA_SIZE,
+ OHCI_HCCA_ALIGN, &sc->sc_hccadma);
  if (err)
  return (err);
  sc->sc_hcca = KERNADDR(&sc->sc_hccadma, 0);
@@ -1933,8 +1933,7 @@ ohci_open(struct usbd_pipe *pipe)
  pipe->methods = &ohci_device_ctrl_methods;
  err = usb_allocmem(&sc->sc_bus,
   sizeof(usb_device_request_t),
-  0, USB_DMA_COHERENT,
-  &opipe->u.ctl.reqdma);
+  0, &opipe->u.ctl.reqdma);
  if (err)
  goto bad;
  s = splusb();
diff --git sys/dev/usb/uhci.c sys/dev/usb/uhci.c
index bae6c1b81a9..47e6d1678c0 100644
--- sys/dev/usb/uhci.c
+++ sys/dev/usb/uhci.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: uhci.c,v 1.151 2020/03/21 12:08:31 patrick Exp $ */
+/* $OpenBSD: uhci.c,v 1.149 2020/02/22 14:01:34 jasper Exp $ */
 /* $NetBSD: uhci.c,v 1.172 2003/02/23 04:19:26 simonb Exp $ */
 /* $FreeBSD: src/sys/dev/usb/uhci.c,v 1.33 1999/11/17 22:33:41 n_hibma Exp $ */
 
@@ -379,7 +379,7 @@ uhci_init(struct uhci_softc *sc)
  /* Allocate and initialize real frame array. */
  err = usb_allocmem(&sc->sc_bus,
   UHCI_FRAMELIST_COUNT * sizeof(uhci_physaddr_t),
-  UHCI_FRAMELIST_ALIGN, USB_DMA_COHERENT, &sc->sc_dma);
+  UHCI_FRAMELIST_ALIGN, &sc->sc_dma);
  if (err)
  return (err);
  sc->sc_pframes = KERNADDR(&sc->sc_dma, 0);
@@ -1415,7 +1415,7 @@ uhci_alloc_std(struct uhci_softc *sc)
  if (sc->sc_freetds == NULL) {
  DPRINTFN(2,("uhci_alloc_std: allocating chunk\n"));
  err = usb_allocmem(&sc->sc_bus, UHCI_STD_SIZE * UHCI_STD_CHUNK,
-  UHCI_TD_ALIGN, USB_DMA_COHERENT, &dma);
+  UHCI_TD_ALIGN, &dma);
  if (err)
  goto out;
  for(i = 0; i < UHCI_STD_CHUNK; i++) {
@@ -1469,7 +1469,7 @@ uhci_alloc_sqh(struct uhci_softc *sc)
  if (sc->sc_freeqhs == NULL) {
  DPRINTFN(2, ("uhci_alloc_sqh: allocating chunk\n"));
  err = usb_allocmem(&sc->sc_bus, UHCI_SQH_SIZE * UHCI_SQH_CHUNK,
-  UHCI_QH_ALIGN, USB_DMA_COHERENT, &dma);
+  UHCI_QH_ALIGN, &dma);
  if (err)
  goto out;
  for (i = 0; i < UHCI_SQH_CHUNK; i++) {
@@ -2652,8 +2652,7 @@ uhci_open(struct usbd_pipe *pipe)
  }
  err = usb_allocmem(&sc->sc_bus,
   sizeof(usb_device_request_t),
-  0, USB_DMA_COHERENT,
-  &upipe->u.ctl.reqdma);
+  0, &upipe->u.ctl.reqdma);
  if (err) {
  uhci_free_sqh(sc, upipe->u.ctl.sqh);
  uhci_free_std(sc, upipe->u.ctl.setup);
diff --git sys/dev/usb/usb_mem.c sys/dev/usb/usb_mem.c
index d0b37e54766..c65906b43f4 100644
--- sys/dev/usb/usb_mem.c
+++ sys/dev/usb/usb_mem.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: usb_mem.c,v 1.34 2020/03/21 12:08:31 patrick Exp $ */
+/* $OpenBSD: usb_mem.c,v 1.32 2018/12/05 17:41:23 gerhard Exp $ */
 /* $NetBSD: usb_mem.c,v 1.26 2003/02/01 06:23:40 thorpej Exp $ */
 
 /*
@@ -72,7 +72,7 @@ struct usb_frag_dma {
 };
 
 usbd_status usb_block_allocmem(bus_dma_tag_t, size_t, size_t,
-    struct usb_dma_block **, int);
+    struct usb_dma_block **);
 void usb_block_freemem(struct usb_dma_block *);
 
 LIST_HEAD(, usb_dma_block) usb_blk_freelist =
@@ -84,7 +84,7 @@ LIST_HEAD(, usb_frag_dma) usb_frag_freelist =
 
 usbd_status
 usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
-    struct usb_dma_block **dmap, int coherent)
+    struct usb_dma_block **dmap)
 {
  int error;
         struct usb_dma_block *p;
@@ -96,8 +96,7 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
  s = splusb();
  /* First check the free list. */
  for (p = LIST_FIRST(&usb_blk_freelist); p; p = LIST_NEXT(p, next)) {
- if (p->tag == tag && p->size >= size && p->align >= align &&
-    p->coherent == coherent) {
+ if (p->tag == tag && p->size >= size && p->align >= align) {
  LIST_REMOVE(p, next);
  usb_blk_nfree--;
  splx(s);
@@ -117,7 +116,6 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
  p->tag = tag;
  p->size = size;
  p->align = align;
- p->coherent = coherent;
  error = bus_dmamem_alloc(tag, p->size, align, 0,
  p->segs, nitems(p->segs),
  &p->nsegs, BUS_DMA_NOWAIT);
@@ -125,8 +123,7 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
  goto free0;
 
  error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
-       &p->kaddr, BUS_DMA_NOWAIT | (coherent ?
-       BUS_DMA_COHERENT : 0));
+       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
  if (error)
  goto free1;
 
@@ -184,25 +181,20 @@ usb_block_freemem(struct usb_dma_block *p)
 }
 
 usbd_status
-usb_allocmem(struct usbd_bus *bus, size_t size, size_t align, int flags,
-    struct usb_dma *p)
+usb_allocmem(struct usbd_bus *bus, size_t size, size_t align, struct usb_dma *p)
 {
  bus_dma_tag_t tag = bus->dmatag;
  usbd_status err;
  struct usb_frag_dma *f;
  struct usb_dma_block *b;
- int coherent;
  int i;
  int s;
 
- coherent = !!(flags & USB_DMA_COHERENT);
-
  /* If the request is large then just use a full block. */
  if (size > USB_MEM_SMALL || align > USB_MEM_SMALL) {
  DPRINTFN(1, ("%s: large alloc %d\n", __func__, (int)size));
  size = (size + USB_MEM_BLOCK - 1) & ~(USB_MEM_BLOCK - 1);
- err = usb_block_allocmem(tag, size, align, &p->block,
-    coherent);
+ err = usb_block_allocmem(tag, size, align, &p->block);
  if (!err) {
  p->block->frags = NULL;
  p->offs = 0;
@@ -213,12 +205,11 @@ usb_allocmem(struct usbd_bus *bus, size_t size, size_t align, int flags,
  s = splusb();
  /* Check for free fragments. */
  for (f = LIST_FIRST(&usb_frag_freelist); f; f = LIST_NEXT(f, next))
- if (f->block->tag == tag && f->block->coherent == coherent)
+ if (f->block->tag == tag)
  break;
  if (f == NULL) {
  DPRINTFN(1, ("usb_allocmem: adding fragments\n"));
- err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL, &b,
-    coherent);
+ err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL,&b);
  if (err) {
  splx(s);
  return (err);
diff --git sys/dev/usb/usb_mem.h sys/dev/usb/usb_mem.h
index 98a221eae9a..1ae933f4bd9 100644
--- sys/dev/usb/usb_mem.h
+++ sys/dev/usb/usb_mem.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: usb_mem.h,v 1.17 2020/03/21 12:08:31 patrick Exp $ */
+/* $OpenBSD: usb_mem.h,v 1.15 2016/11/30 10:19:18 mpi Exp $ */
 /* $NetBSD: usb_mem.h,v 1.20 2003/05/03 18:11:42 wiz Exp $ */
 /* $FreeBSD: src/sys/dev/usb/usb_mem.h,v 1.9 1999/11/17 22:33:47 n_hibma Exp $ */
 
@@ -40,7 +40,6 @@ struct usb_dma_block {
         caddr_t kaddr;
         bus_dma_segment_t segs[1];
         int nsegs;
- int coherent;
         size_t size;
         size_t align;
  struct usb_frag_dma *frags;
@@ -51,7 +50,6 @@ struct usb_dma_block {
 #define KERNADDR(dma, o) \
  ((void *)((char *)((dma)->block->kaddr + (dma)->offs) + (o)))
 
-usbd_status usb_allocmem(struct usbd_bus *, size_t, size_t, int,
-    struct usb_dma *);
+usbd_status usb_allocmem(struct usbd_bus *,size_t,size_t, struct usb_dma *);
 void usb_freemem(struct usbd_bus *, struct usb_dma *);
 void usb_syncmem(struct usb_dma *, bus_addr_t, bus_size_t, int);
diff --git sys/dev/usb/usbdi.c sys/dev/usb/usbdi.c
index 5161afc21e5..aba1220103f 100644
--- sys/dev/usb/usbdi.c
+++ sys/dev/usb/usbdi.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: usbdi.c,v 1.104 2020/03/21 12:08:31 patrick Exp $ */
+/* $OpenBSD: usbdi.c,v 1.103 2020/02/22 14:01:35 jasper Exp $ */
 /* $NetBSD: usbdi.c,v 1.103 2002/09/27 15:37:38 provos Exp $ */
 /* $FreeBSD: src/sys/dev/usb/usbdi.c,v 1.28 1999/11/17 22:33:49 n_hibma Exp $ */
 
@@ -305,7 +305,7 @@ usbd_transfer(struct usbd_xfer *xfer)
  if (xfer->rqflags & URQ_AUTO_DMABUF)
  printf("usbd_transfer: has old buffer!\n");
 #endif
- err = usb_allocmem(bus, xfer->length, 0, 0, &xfer->dmabuf);
+ err = usb_allocmem(bus, xfer->length, 0, &xfer->dmabuf);
  if (err)
  return (err);
  xfer->rqflags |= URQ_AUTO_DMABUF;
@@ -389,7 +389,7 @@ usbd_alloc_buffer(struct usbd_xfer *xfer, u_int32_t size)
  if (xfer->rqflags & (URQ_DEV_DMABUF | URQ_AUTO_DMABUF))
  printf("usbd_alloc_buffer: xfer already has a buffer\n");
 #endif
- err = usb_allocmem(bus, size, 0, 0, &xfer->dmabuf);
+ err = usb_allocmem(bus, size, 0, &xfer->dmabuf);
  if (err)
  return (NULL);
  xfer->rqflags |= URQ_DEV_DMABUF;
diff --git sys/dev/usb/usbdivar.h sys/dev/usb/usbdivar.h
index 5aa207818f7..6600c402979 100644
--- sys/dev/usb/usbdivar.h
+++ sys/dev/usb/usbdivar.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: usbdivar.h,v 1.81 2020/03/21 12:08:31 patrick Exp $ */
+/* $OpenBSD: usbdivar.h,v 1.79 2018/11/27 14:56:09 mpi Exp $ */
 /* $NetBSD: usbdivar.h,v 1.70 2002/07/11 21:14:36 augustss Exp $ */
 /* $FreeBSD: src/sys/dev/usb/usbdivar.h,v 1.11 1999/11/17 22:33:51 n_hibma Exp $ */
 
@@ -47,7 +47,6 @@ struct usb_dma_block;
 struct usb_dma {
  struct usb_dma_block *block;
  u_int offs;
-#define USB_DMA_COHERENT (1 << 0)
 };
 
 struct usbd_xfer;

Reply | Threaded
Open this post in threaded view
|

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

Jonathan Gray-11
On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:

> On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > Hi,
> > >
> > > I've spent a few days investigating why USB ethernet adapters are so
> > > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > Using cached data buffers makes the performance rise from 20 mbit/s to
> > > 200 mbit/s.  Quite a difference.
> > >
> > > sys/dev/usb/usb_mem.c:
> > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > >       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > >
> > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > > not, so mapping it COHERENT means uncached and thus slow.
> > >
> > > Why do we do that?  Well, when the code was imported in 99, it was
> > > already there.  Since then we have gained infrastructure for DMA
> > > syncs in the USB stack, which I think are proper.
> > >
> > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > >
> > > if (!usbd_xfer_isread(xfer)) {
> > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
> > >    xfer->length);
> > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > >    BUS_DMASYNC_PREWRITE);
> > > } else
> > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > >    BUS_DMASYNC_PREREAD);
> > > err = pipe->methods->transfer(xfer);
> > >
> > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > >
> > > if (xfer->actlen != 0) {
> > > if (usbd_xfer_isread(xfer)) {
> > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > >    BUS_DMASYNC_POSTREAD);
> > > if (!(xfer->flags & USBD_NO_COPY))
> > > memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
> > >    xfer->actlen);
> > > } else
> > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > >    BUS_DMASYNC_POSTWRITE);
> > > }
> > >
> > > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > > the same backend to allocate their rings.  And I can't vouch for those
> > > drivers' sanity.
> > >
> > > As a first step, I would like to go ahead with another solution, which
> > > is based on a diff from Marius Strobl, who added those syncs in the
> > > first place.  Essentially it splits the memory handling into cacheable
> > > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > > still don't.  This is a bit of a safer approach imho, since we don't
> > > hurt the controller drivers, but speed up the data buffers.
> > >
> > > Once we have verified that there are no regressions, we can adjust
> > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > well as before, and maybe then back this out again.
> > >
> > > Keep note that this is all a no-op on X86, but all the other archs will
> > > profit from this.
> > >
> > > ok?
> > >
> > > Patrick
> >
> > Update diff with inverted logic.  kettenis@ argues that we should
> > invert the logic, and those who need COHERENT memory should ask
> > for that explicitly, since for bus_dmamem_map() it also needs to
> > be passed explicitly.  This also points out all those users that
> > use usb_allocmem() internally, where we might want to have a look
> > if COHERENT is actually needed or not, or if it can be refactored
> > in another way.
>
> These commits broke usb on imx.6 with cubox:
>
> imxehci0 at simplebus3
> usb0 at imxehci0: USB revision 2.0
> usb0: root hub problem
> imxehci1 at simplebus3
> usb1 at imxehci1: USB revision 2.0
> usb1: root hub problem
> "usbmisc" at simplebus3 not configured

pandaboard with omap4 (another cortex a9) also has broken usb with
the latest snapshot:

omehci0 at simplebus0
usb0 at omehci0: USB revision 2.0
usb0: root hub problem

Reply | Threaded
Open this post in threaded view
|

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

Patrick Wildt-3
On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:

> On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > Hi,
> > > >
> > > > I've spent a few days investigating why USB ethernet adapters are so
> > > > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > > > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > > Using cached data buffers makes the performance rise from 20 mbit/s to
> > > > 200 mbit/s.  Quite a difference.
> > > >
> > > > sys/dev/usb/usb_mem.c:
> > > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > >       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > >
> > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > > > not, so mapping it COHERENT means uncached and thus slow.
> > > >
> > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > already there.  Since then we have gained infrastructure for DMA
> > > > syncs in the USB stack, which I think are proper.
> > > >
> > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > >
> > > > if (!usbd_xfer_isread(xfer)) {
> > > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
> > > >    xfer->length);
> > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > >    BUS_DMASYNC_PREWRITE);
> > > > } else
> > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > >    BUS_DMASYNC_PREREAD);
> > > > err = pipe->methods->transfer(xfer);
> > > >
> > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > >
> > > > if (xfer->actlen != 0) {
> > > > if (usbd_xfer_isread(xfer)) {
> > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > >    BUS_DMASYNC_POSTREAD);
> > > > if (!(xfer->flags & USBD_NO_COPY))
> > > > memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
> > > >    xfer->actlen);
> > > > } else
> > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > >    BUS_DMASYNC_POSTWRITE);
> > > > }
> > > >
> > > > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > > > the same backend to allocate their rings.  And I can't vouch for those
> > > > drivers' sanity.
> > > >
> > > > As a first step, I would like to go ahead with another solution, which
> > > > is based on a diff from Marius Strobl, who added those syncs in the
> > > > first place.  Essentially it splits the memory handling into cacheable
> > > > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > > > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > > > still don't.  This is a bit of a safer approach imho, since we don't
> > > > hurt the controller drivers, but speed up the data buffers.
> > > >
> > > > Once we have verified that there are no regressions, we can adjust
> > > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > > well as before, and maybe then back this out again.
> > > >
> > > > Keep note that this is all a no-op on X86, but all the other archs will
> > > > profit from this.
> > > >
> > > > ok?
> > > >
> > > > Patrick
> > >
> > > Update diff with inverted logic.  kettenis@ argues that we should
> > > invert the logic, and those who need COHERENT memory should ask
> > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > be passed explicitly.  This also points out all those users that
> > > use usb_allocmem() internally, where we might want to have a look
> > > if COHERENT is actually needed or not, or if it can be refactored
> > > in another way.
> >
> > These commits broke usb on imx.6 with cubox:
> >
> > imxehci0 at simplebus3
> > usb0 at imxehci0: USB revision 2.0
> > usb0: root hub problem
> > imxehci1 at simplebus3
> > usb1 at imxehci1: USB revision 2.0
> > usb1: root hub problem
> > "usbmisc" at simplebus3 not configured
>
> pandaboard with omap4 (another cortex a9) also has broken usb with
> the latest snapshot:
>
> omehci0 at simplebus0
> usb0 at omehci0: USB revision 2.0
> usb0: root hub problem

I think I know what it is.  When we enqueue a request for the root
hub, the buffer, for which the USB subsystem allocates a DMA buffer,
is filled not by an external device, but by our driver.

Now on completion of that request, since it's doing a READ of the
root hub, it will do a DMA sync with POSTREAD.  Since the ehci code
hasn't flushed the buffer do memory, but simply did a memcpy to the
buffer, the POSTREAD will essentially drop whatever ehci's memcpy
did.

Can you check if that makes a difference?  It essentially forces the
root hub code to flush the data to the caches, so that the POSTREAD
can successfully flush the cache and read the data from memory.  I
wonder if there's a better way of doing this, but I kinda doubt it.

Patrick

diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
index a352d83eaf4..d81901d3762 100644
--- a/sys/dev/usb/ehci.c
+++ b/sys/dev/usb/ehci.c
@@ -2154,6 +2154,9 @@ ehci_root_ctrl_start(struct usbd_xfer *xfer)
  err = USBD_IOERROR;
  goto ret;
  }
+ if (totlen != 0)
+ usb_syncmem(&xfer->dmabuf, 0, totlen,
+    BUS_DMASYNC_PREWRITE);
  xfer->actlen = totlen;
  err = USBD_NORMAL_COMPLETION;
  ret:

Reply | Threaded
Open this post in threaded view
|

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

Patrick Wildt-3
On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:

> On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > Hi,
> > > > >
> > > > > I've spent a few days investigating why USB ethernet adapters are so
> > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > > > > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > > > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > > > Using cached data buffers makes the performance rise from 20 mbit/s to
> > > > > 200 mbit/s.  Quite a difference.
> > > > >
> > > > > sys/dev/usb/usb_mem.c:
> > > > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > >       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > >
> > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > >
> > > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > syncs in the USB stack, which I think are proper.
> > > > >
> > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > >
> > > > > if (!usbd_xfer_isread(xfer)) {
> > > > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > > memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
> > > > >    xfer->length);
> > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > >    BUS_DMASYNC_PREWRITE);
> > > > > } else
> > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > >    BUS_DMASYNC_PREREAD);
> > > > > err = pipe->methods->transfer(xfer);
> > > > >
> > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > >
> > > > > if (xfer->actlen != 0) {
> > > > > if (usbd_xfer_isread(xfer)) {
> > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > >    BUS_DMASYNC_POSTREAD);
> > > > > if (!(xfer->flags & USBD_NO_COPY))
> > > > > memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
> > > > >    xfer->actlen);
> > > > > } else
> > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > >    BUS_DMASYNC_POSTWRITE);
> > > > > }
> > > > >
> > > > > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > > > > the same backend to allocate their rings.  And I can't vouch for those
> > > > > drivers' sanity.
> > > > >
> > > > > As a first step, I would like to go ahead with another solution, which
> > > > > is based on a diff from Marius Strobl, who added those syncs in the
> > > > > first place.  Essentially it splits the memory handling into cacheable
> > > > > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > > > > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > > > > still don't.  This is a bit of a safer approach imho, since we don't
> > > > > hurt the controller drivers, but speed up the data buffers.
> > > > >
> > > > > Once we have verified that there are no regressions, we can adjust
> > > > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > > > well as before, and maybe then back this out again.
> > > > >
> > > > > Keep note that this is all a no-op on X86, but all the other archs will
> > > > > profit from this.
> > > > >
> > > > > ok?
> > > > >
> > > > > Patrick
> > > >
> > > > Update diff with inverted logic.  kettenis@ argues that we should
> > > > invert the logic, and those who need COHERENT memory should ask
> > > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > > be passed explicitly.  This also points out all those users that
> > > > use usb_allocmem() internally, where we might want to have a look
> > > > if COHERENT is actually needed or not, or if it can be refactored
> > > > in another way.
> > >
> > > These commits broke usb on imx.6 with cubox:
> > >
> > > imxehci0 at simplebus3
> > > usb0 at imxehci0: USB revision 2.0
> > > usb0: root hub problem
> > > imxehci1 at simplebus3
> > > usb1 at imxehci1: USB revision 2.0
> > > usb1: root hub problem
> > > "usbmisc" at simplebus3 not configured
> >
> > pandaboard with omap4 (another cortex a9) also has broken usb with
> > the latest snapshot:
> >
> > omehci0 at simplebus0
> > usb0 at omehci0: USB revision 2.0
> > usb0: root hub problem
>
> I think I know what it is.  When we enqueue a request for the root
> hub, the buffer, for which the USB subsystem allocates a DMA buffer,
> is filled not by an external device, but by our driver.
>
> Now on completion of that request, since it's doing a READ of the
> root hub, it will do a DMA sync with POSTREAD.  Since the ehci code
> hasn't flushed the buffer do memory, but simply did a memcpy to the
> buffer, the POSTREAD will essentially drop whatever ehci's memcpy
> did.
>
> Can you check if that makes a difference?  It essentially forces the
> root hub code to flush the data to the caches, so that the POSTREAD
> can successfully flush the cache and read the data from memory.  I
> wonder if there's a better way of doing this, but I kinda doubt it.
>
> Patrick
>
> diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
> index a352d83eaf4..d81901d3762 100644
> --- a/sys/dev/usb/ehci.c
> +++ b/sys/dev/usb/ehci.c
> @@ -2154,6 +2154,9 @@ ehci_root_ctrl_start(struct usbd_xfer *xfer)
>   err = USBD_IOERROR;
>   goto ret;
>   }
> + if (totlen != 0)
> + usb_syncmem(&xfer->dmabuf, 0, totlen,
> +    BUS_DMASYNC_PREWRITE);
>   xfer->actlen = totlen;
>   err = USBD_NORMAL_COMPLETION;
>   ret:
>

If that diff does indeed work, I think a better fix would be to move the
DMA syncs from the USB layer into the driver layer, so that for the root
methods no sync has to be done, but all the individual device methods do
proper syncs.  Preparing that diff would take some more minutes though.
In the meantime we could use USB_DMA_COHERENT for all requests without
an extra allocated buffer, which includes all the control requests.
Then move the syncs to the drivers, and at last remove the flag again.

diff --git a/sys/dev/usb/usbdi.c b/sys/dev/usb/usbdi.c
index 5161afc21e5..a97e22b64a3 100644
--- a/sys/dev/usb/usbdi.c
+++ b/sys/dev/usb/usbdi.c
@@ -305,7 +305,8 @@ usbd_transfer(struct usbd_xfer *xfer)
  if (xfer->rqflags & URQ_AUTO_DMABUF)
  printf("usbd_transfer: has old buffer!\n");
 #endif
- err = usb_allocmem(bus, xfer->length, 0, 0, &xfer->dmabuf);
+ err = usb_allocmem(bus, xfer->length, 0, USB_DMA_COHERENT,
+    &xfer->dmabuf);
  if (err)
  return (err);
  xfer->rqflags |= URQ_AUTO_DMABUF;

Reply | Threaded
Open this post in threaded view
|

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

Mark Kettenis
> Date: Wed, 1 Apr 2020 09:40:05 +0200
> From: Patrick Wildt <[hidden email]>
>
> On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:
> > On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > > Hi,
> > > > > >
> > > > > > I've spent a few days investigating why USB ethernet adapters are so
> > > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > > > > > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > > > > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > > > > Using cached data buffers makes the performance rise from 20 mbit/s to
> > > > > > 200 mbit/s.  Quite a difference.
> > > > > >
> > > > > > sys/dev/usb/usb_mem.c:
> > > > > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > > >       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > > >
> > > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > > > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > > >
> > > > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > > syncs in the USB stack, which I think are proper.
> > > > > >
> > > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > > >
> > > > > > if (!usbd_xfer_isread(xfer)) {
> > > > > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > > > memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
> > > > > >    xfer->length);
> > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > > >    BUS_DMASYNC_PREWRITE);
> > > > > > } else
> > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > > >    BUS_DMASYNC_PREREAD);
> > > > > > err = pipe->methods->transfer(xfer);
> > > > > >
> > > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > > >
> > > > > > if (xfer->actlen != 0) {
> > > > > > if (usbd_xfer_isread(xfer)) {
> > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > > >    BUS_DMASYNC_POSTREAD);
> > > > > > if (!(xfer->flags & USBD_NO_COPY))
> > > > > > memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
> > > > > >    xfer->actlen);
> > > > > > } else
> > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > > >    BUS_DMASYNC_POSTWRITE);
> > > > > > }
> > > > > >
> > > > > > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > > > > > the same backend to allocate their rings.  And I can't vouch for those
> > > > > > drivers' sanity.
> > > > > >
> > > > > > As a first step, I would like to go ahead with another solution, which
> > > > > > is based on a diff from Marius Strobl, who added those syncs in the
> > > > > > first place.  Essentially it splits the memory handling into cacheable
> > > > > > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > > > > > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > > > > > still don't.  This is a bit of a safer approach imho, since we don't
> > > > > > hurt the controller drivers, but speed up the data buffers.
> > > > > >
> > > > > > Once we have verified that there are no regressions, we can adjust
> > > > > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > > > > well as before, and maybe then back this out again.
> > > > > >
> > > > > > Keep note that this is all a no-op on X86, but all the other archs will
> > > > > > profit from this.
> > > > > >
> > > > > > ok?
> > > > > >
> > > > > > Patrick
> > > > >
> > > > > Update diff with inverted logic.  kettenis@ argues that we should
> > > > > invert the logic, and those who need COHERENT memory should ask
> > > > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > > > be passed explicitly.  This also points out all those users that
> > > > > use usb_allocmem() internally, where we might want to have a look
> > > > > if COHERENT is actually needed or not, or if it can be refactored
> > > > > in another way.
> > > >
> > > > These commits broke usb on imx.6 with cubox:
> > > >
> > > > imxehci0 at simplebus3
> > > > usb0 at imxehci0: USB revision 2.0
> > > > usb0: root hub problem
> > > > imxehci1 at simplebus3
> > > > usb1 at imxehci1: USB revision 2.0
> > > > usb1: root hub problem
> > > > "usbmisc" at simplebus3 not configured
> > >
> > > pandaboard with omap4 (another cortex a9) also has broken usb with
> > > the latest snapshot:
> > >
> > > omehci0 at simplebus0
> > > usb0 at omehci0: USB revision 2.0
> > > usb0: root hub problem
> >
> > I think I know what it is.  When we enqueue a request for the root
> > hub, the buffer, for which the USB subsystem allocates a DMA buffer,
> > is filled not by an external device, but by our driver.
> >
> > Now on completion of that request, since it's doing a READ of the
> > root hub, it will do a DMA sync with POSTREAD.  Since the ehci code
> > hasn't flushed the buffer do memory, but simply did a memcpy to the
> > buffer, the POSTREAD will essentially drop whatever ehci's memcpy
> > did.
> >
> > Can you check if that makes a difference?  It essentially forces the
> > root hub code to flush the data to the caches, so that the POSTREAD
> > can successfully flush the cache and read the data from memory.  I
> > wonder if there's a better way of doing this, but I kinda doubt it.
> >
> > Patrick
> >
> > diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
> > index a352d83eaf4..d81901d3762 100644
> > --- a/sys/dev/usb/ehci.c
> > +++ b/sys/dev/usb/ehci.c
> > @@ -2154,6 +2154,9 @@ ehci_root_ctrl_start(struct usbd_xfer *xfer)
> >   err = USBD_IOERROR;
> >   goto ret;
> >   }
> > + if (totlen != 0)
> > + usb_syncmem(&xfer->dmabuf, 0, totlen,
> > +    BUS_DMASYNC_PREWRITE);
> >   xfer->actlen = totlen;
> >   err = USBD_NORMAL_COMPLETION;
> >   ret:
> >
>
> If that diff does indeed work, I think a better fix would be to move the
> DMA syncs from the USB layer into the driver layer, so that for the root
> methods no sync has to be done, but all the individual device methods do
> proper syncs.  Preparing that diff would take some more minutes though.
> In the meantime we could use USB_DMA_COHERENT for all requests without
> an extra allocated buffer, which includes all the control requests.
> Then move the syncs to the drivers, and at last remove the flag again.

This diff works as well.  I suggest you commit it while we discuss
other options.

ok kettenis@

> diff --git a/sys/dev/usb/usbdi.c b/sys/dev/usb/usbdi.c
> index 5161afc21e5..a97e22b64a3 100644
> --- a/sys/dev/usb/usbdi.c
> +++ b/sys/dev/usb/usbdi.c
> @@ -305,7 +305,8 @@ usbd_transfer(struct usbd_xfer *xfer)
>   if (xfer->rqflags & URQ_AUTO_DMABUF)
>   printf("usbd_transfer: has old buffer!\n");
>  #endif
> - err = usb_allocmem(bus, xfer->length, 0, 0, &xfer->dmabuf);
> + err = usb_allocmem(bus, xfer->length, 0, USB_DMA_COHERENT,
> +    &xfer->dmabuf);
>   if (err)
>   return (err);
>   xfer->rqflags |= URQ_AUTO_DMABUF;
>
>

Reply | Threaded
Open this post in threaded view
|

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

Martin Pieuchot
On 01/04/20(Wed) 10:06, Mark Kettenis wrote:

> > Date: Wed, 1 Apr 2020 09:40:05 +0200
> > From: Patrick Wildt <[hidden email]>
> >
> > On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:
> > > On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > > > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > > > Hi,
> > > > > > >
> > > > > > > I've spent a few days investigating why USB ethernet adapters are so
> > > > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > > > > > > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > > > > > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > > > > > Using cached data buffers makes the performance rise from 20 mbit/s to
> > > > > > > 200 mbit/s.  Quite a difference.
> > > > > > >
> > > > > > > sys/dev/usb/usb_mem.c:
> > > > > > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > > > >       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > > > >
> > > > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > > > > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > > > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > > > >
> > > > > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > > > syncs in the USB stack, which I think are proper.
> > > > > > >
> > > > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > > > >
> > > > > > > if (!usbd_xfer_isread(xfer)) {
> > > > > > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > > > > memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
> > > > > > >    xfer->length);
> > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > > > >    BUS_DMASYNC_PREWRITE);
> > > > > > > } else
> > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > > > >    BUS_DMASYNC_PREREAD);
> > > > > > > err = pipe->methods->transfer(xfer);
> > > > > > >
> > > > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > > > >
> > > > > > > if (xfer->actlen != 0) {
> > > > > > > if (usbd_xfer_isread(xfer)) {
> > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > > > >    BUS_DMASYNC_POSTREAD);
> > > > > > > if (!(xfer->flags & USBD_NO_COPY))
> > > > > > > memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
> > > > > > >    xfer->actlen);
> > > > > > > } else
> > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > > > >    BUS_DMASYNC_POSTWRITE);
> > > > > > > }
> > > > > > >
> > > > > > > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > > > > > > the same backend to allocate their rings.  And I can't vouch for those
> > > > > > > drivers' sanity.
> > > > > > >
> > > > > > > As a first step, I would like to go ahead with another solution, which
> > > > > > > is based on a diff from Marius Strobl, who added those syncs in the
> > > > > > > first place.  Essentially it splits the memory handling into cacheable
> > > > > > > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > > > > > > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > > > > > > still don't.  This is a bit of a safer approach imho, since we don't
> > > > > > > hurt the controller drivers, but speed up the data buffers.
> > > > > > >
> > > > > > > Once we have verified that there are no regressions, we can adjust
> > > > > > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > > > > > well as before, and maybe then back this out again.
> > > > > > >
> > > > > > > Keep note that this is all a no-op on X86, but all the other archs will
> > > > > > > profit from this.
> > > > > > >
> > > > > > > ok?
> > > > > > >
> > > > > > > Patrick
> > > > > >
> > > > > > Update diff with inverted logic.  kettenis@ argues that we should
> > > > > > invert the logic, and those who need COHERENT memory should ask
> > > > > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > > > > be passed explicitly.  This also points out all those users that
> > > > > > use usb_allocmem() internally, where we might want to have a look
> > > > > > if COHERENT is actually needed or not, or if it can be refactored
> > > > > > in another way.
> > > > >
> > > > > These commits broke usb on imx.6 with cubox:
> > > > >
> > > > > imxehci0 at simplebus3
> > > > > usb0 at imxehci0: USB revision 2.0
> > > > > usb0: root hub problem
> > > > > imxehci1 at simplebus3
> > > > > usb1 at imxehci1: USB revision 2.0
> > > > > usb1: root hub problem
> > > > > "usbmisc" at simplebus3 not configured
> > > >
> > > > pandaboard with omap4 (another cortex a9) also has broken usb with
> > > > the latest snapshot:
> > > >
> > > > omehci0 at simplebus0
> > > > usb0 at omehci0: USB revision 2.0
> > > > usb0: root hub problem
> > >
> > > I think I know what it is.  When we enqueue a request for the root
> > > hub, the buffer, for which the USB subsystem allocates a DMA buffer,
> > > is filled not by an external device, but by our driver.
> > >
> > > Now on completion of that request, since it's doing a READ of the
> > > root hub, it will do a DMA sync with POSTREAD.  Since the ehci code
> > > hasn't flushed the buffer do memory, but simply did a memcpy to the
> > > buffer, the POSTREAD will essentially drop whatever ehci's memcpy
> > > did.
> > >
> > > Can you check if that makes a difference?  It essentially forces the
> > > root hub code to flush the data to the caches, so that the POSTREAD
> > > can successfully flush the cache and read the data from memory.  I
> > > wonder if there's a better way of doing this, but I kinda doubt it.
> > >
> > > Patrick
> > >
> > > diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
> > > index a352d83eaf4..d81901d3762 100644
> > > --- a/sys/dev/usb/ehci.c
> > > +++ b/sys/dev/usb/ehci.c
> > > @@ -2154,6 +2154,9 @@ ehci_root_ctrl_start(struct usbd_xfer *xfer)
> > >   err = USBD_IOERROR;
> > >   goto ret;
> > >   }
> > > + if (totlen != 0)
> > > + usb_syncmem(&xfer->dmabuf, 0, totlen,
> > > +    BUS_DMASYNC_PREWRITE);
> > >   xfer->actlen = totlen;
> > >   err = USBD_NORMAL_COMPLETION;
> > >   ret:
> > >
> >
> > If that diff does indeed work, I think a better fix would be to move the
> > DMA syncs from the USB layer into the driver layer, so that for the root
> > methods no sync has to be done, but all the individual device methods do
> > proper syncs.  Preparing that diff would take some more minutes though.
> > In the meantime we could use USB_DMA_COHERENT for all requests without
> > an extra allocated buffer, which includes all the control requests.
> > Then move the syncs to the drivers, and at last remove the flag again.
>
> This diff works as well.  I suggest you commit it while we discuss
> other options.

I agree, although I wouldn't mind a comment explaining why this is a
"temporary" workaround.  Because as we all know "temporary" is very
relative :o)

Reply | Threaded
Open this post in threaded view
|

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

Patrick Wildt-3
In reply to this post by Patrick Wildt-3
On Wed, Apr 01, 2020 at 09:40:06AM +0200, Patrick Wildt wrote:

> On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:
> > On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > > Hi,
> > > > > >
> > > > > > I've spent a few days investigating why USB ethernet adapters are so
> > > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > > > > > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > > > > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > > > > Using cached data buffers makes the performance rise from 20 mbit/s to
> > > > > > 200 mbit/s.  Quite a difference.
> > > > > >
> > > > > > sys/dev/usb/usb_mem.c:
> > > > > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > > >       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > > >
> > > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > > > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > > >
> > > > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > > syncs in the USB stack, which I think are proper.
> > > > > >
> > > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > > >
> > > > > > if (!usbd_xfer_isread(xfer)) {
> > > > > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > > > memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
> > > > > >    xfer->length);
> > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > > >    BUS_DMASYNC_PREWRITE);
> > > > > > } else
> > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > > >    BUS_DMASYNC_PREREAD);
> > > > > > err = pipe->methods->transfer(xfer);
> > > > > >
> > > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > > >
> > > > > > if (xfer->actlen != 0) {
> > > > > > if (usbd_xfer_isread(xfer)) {
> > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > > >    BUS_DMASYNC_POSTREAD);
> > > > > > if (!(xfer->flags & USBD_NO_COPY))
> > > > > > memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
> > > > > >    xfer->actlen);
> > > > > > } else
> > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > > >    BUS_DMASYNC_POSTWRITE);
> > > > > > }
> > > > > >
> > > > > > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > > > > > the same backend to allocate their rings.  And I can't vouch for those
> > > > > > drivers' sanity.
> > > > > >
> > > > > > As a first step, I would like to go ahead with another solution, which
> > > > > > is based on a diff from Marius Strobl, who added those syncs in the
> > > > > > first place.  Essentially it splits the memory handling into cacheable
> > > > > > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > > > > > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > > > > > still don't.  This is a bit of a safer approach imho, since we don't
> > > > > > hurt the controller drivers, but speed up the data buffers.
> > > > > >
> > > > > > Once we have verified that there are no regressions, we can adjust
> > > > > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > > > > well as before, and maybe then back this out again.
> > > > > >
> > > > > > Keep note that this is all a no-op on X86, but all the other archs will
> > > > > > profit from this.
> > > > > >
> > > > > > ok?
> > > > > >
> > > > > > Patrick
> > > > >
> > > > > Update diff with inverted logic.  kettenis@ argues that we should
> > > > > invert the logic, and those who need COHERENT memory should ask
> > > > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > > > be passed explicitly.  This also points out all those users that
> > > > > use usb_allocmem() internally, where we might want to have a look
> > > > > if COHERENT is actually needed or not, or if it can be refactored
> > > > > in another way.
> > > >
> > > > These commits broke usb on imx.6 with cubox:
> > > >
> > > > imxehci0 at simplebus3
> > > > usb0 at imxehci0: USB revision 2.0
> > > > usb0: root hub problem
> > > > imxehci1 at simplebus3
> > > > usb1 at imxehci1: USB revision 2.0
> > > > usb1: root hub problem
> > > > "usbmisc" at simplebus3 not configured
> > >
> > > pandaboard with omap4 (another cortex a9) also has broken usb with
> > > the latest snapshot:
> > >
> > > omehci0 at simplebus0
> > > usb0 at omehci0: USB revision 2.0
> > > usb0: root hub problem
> >
> > I think I know what it is.  When we enqueue a request for the root
> > hub, the buffer, for which the USB subsystem allocates a DMA buffer,
> > is filled not by an external device, but by our driver.
> >
> > Now on completion of that request, since it's doing a READ of the
> > root hub, it will do a DMA sync with POSTREAD.  Since the ehci code
> > hasn't flushed the buffer do memory, but simply did a memcpy to the
> > buffer, the POSTREAD will essentially drop whatever ehci's memcpy
> > did.
> >
> > Can you check if that makes a difference?  It essentially forces the
> > root hub code to flush the data to the caches, so that the POSTREAD
> > can successfully flush the cache and read the data from memory.  I
> > wonder if there's a better way of doing this, but I kinda doubt it.
> >
> > Patrick
> >
> > diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
> > index a352d83eaf4..d81901d3762 100644
> > --- a/sys/dev/usb/ehci.c
> > +++ b/sys/dev/usb/ehci.c
> > @@ -2154,6 +2154,9 @@ ehci_root_ctrl_start(struct usbd_xfer *xfer)
> >   err = USBD_IOERROR;
> >   goto ret;
> >   }
> > + if (totlen != 0)
> > + usb_syncmem(&xfer->dmabuf, 0, totlen,
> > +    BUS_DMASYNC_PREWRITE);
> >   xfer->actlen = totlen;
> >   err = USBD_NORMAL_COMPLETION;
> >   ret:
> >
>
> If that diff does indeed work, I think a better fix would be to move the
> DMA syncs from the USB layer into the driver layer, so that for the root
> methods no sync has to be done, but all the individual device methods do
> proper syncs.  Preparing that diff would take some more minutes though.
> In the meantime we could use USB_DMA_COHERENT for all requests without
> an extra allocated buffer, which includes all the control requests.
> Then move the syncs to the drivers, and at last remove the flag again.

Since kettenis@ wondered how that should look like, I took the time to
look at eHCI and here's how I approached it.  No diff yet, I'll come up
with that soon.

======
Before Start of Transfer:  We need to make sure that the buffers are
  synced before handing them to the hardware.

Check device_*_start to sync &xfer->dmabuf.  Easiest way is to check,
where DMAADDR(&xfer->dmabuf) used.  That's where we tell the hardware
where our transfer buffer is.  So those indicate places where we should
sync before telling the HW to go.

* ehci_pcd(): No device transfer, just memset, sync would be wrong. OK
* ehci_root_ctrl_start(): No device transfer, sync would be wrong. OK
* ehci_alloc_sqtd_chain(): chain is allocated based on the buffer, we
  already call usb_syncmem() for the buffer. OK
* ehci_device_intr_done(): called _after_ the transfer completes and
  after we memcpy the data from a successful transfer. on repeat, where
  it enqueues the transfer again, it calls usb_syncmem() for POST.  I
  think that one is nonsense, and should have been done before calling
  usb_transfer_complete().  Then it calls ehci_alloc_sqtd_chain(), which
  does a proper PRE-sync. NOT OK
* ehci_alloc_itd_chain(): Similar to alloc_sqtd_chain(), but there's no
  sync. NOT OK
* ehci_alloc_sitd_chain(): Similar to alloc_sqtd_chain(), but there's no
  sync. NOT OK
======
Before Transfer Completion: We need to make sure buffers are synced
before handing them back to the stack.

Check usb_transfer_complete() callers to have synced &xfer->dmabuf.  But
not those for error handling, only those were we supposedly have buffers
with valid data.

* ehci_isoc_idone(): only the descriptors are synced, probably needs a
  sync on the buffer. NOT OK
* ehci_idone(): only the descriptors are synced, probably needs a sync on
  the buffer, especially since there's the following comment. NOT OK

        /* XXX transfer_complete memcpys out transfer data (for in endpoints)
         * during this call, before methods->done is called: dma sync required
         * beforehand? */
        usb_transfer_complete(xfer);

* ehci_root_ctrl_start: No device transfer, same code path as in the
  other part. OK.
* all other functions: error handling, no successfull completion, no sync
  needed. OK.
====

So, to summarize:

* ehci_alloc_itd_chain() and ehci_alloc_sitd_chain() both should sync
  so that the buffers handed to the hardware are fine.
* ehci_isoc_idone() and ehci_idone() both should sync the buffers
  modified by the hardware, before handing them to the stack.
* ehci_device_intr_done() doesn't need to call usb_syncmem again,
  after we have fixed ehci_idone() to do the sync.

With those changes, removing USB_DMA_COHERENT from the usb_allocmem()
again, as well as removing the usb_syncmem() calls in usbdi.c, should
make the Cubox-i work.  I'll come up with a diff and see if I can find
some Cubox-i to test it with.

Though we'll still need to check ohci, uhci, xhci and dwc2 before we
can actually rip that out again.

Patrick

Reply | Threaded
Open this post in threaded view
|

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

Patrick Wildt-3
On Wed, Apr 01, 2020 at 12:04:25PM +0200, Patrick Wildt wrote:

> On Wed, Apr 01, 2020 at 09:40:06AM +0200, Patrick Wildt wrote:
> > On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:
> > > On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > > > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > > > Hi,
> > > > > > >
> > > > > > > I've spent a few days investigating why USB ethernet adapters are so
> > > > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > > > > > > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > > > > > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > > > > > Using cached data buffers makes the performance rise from 20 mbit/s to
> > > > > > > 200 mbit/s.  Quite a difference.
> > > > > > >
> > > > > > > sys/dev/usb/usb_mem.c:
> > > > > > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > > > >       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > > > >
> > > > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > > > > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > > > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > > > >
> > > > > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > > > syncs in the USB stack, which I think are proper.
> > > > > > >
> > > > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > > > >
> > > > > > > if (!usbd_xfer_isread(xfer)) {
> > > > > > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > > > > memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
> > > > > > >    xfer->length);
> > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > > > >    BUS_DMASYNC_PREWRITE);
> > > > > > > } else
> > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > > > >    BUS_DMASYNC_PREREAD);
> > > > > > > err = pipe->methods->transfer(xfer);
> > > > > > >
> > > > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > > > >
> > > > > > > if (xfer->actlen != 0) {
> > > > > > > if (usbd_xfer_isread(xfer)) {
> > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > > > >    BUS_DMASYNC_POSTREAD);
> > > > > > > if (!(xfer->flags & USBD_NO_COPY))
> > > > > > > memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
> > > > > > >    xfer->actlen);
> > > > > > > } else
> > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > > > >    BUS_DMASYNC_POSTWRITE);
> > > > > > > }
> > > > > > >
> > > > > > > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > > > > > > the same backend to allocate their rings.  And I can't vouch for those
> > > > > > > drivers' sanity.
> > > > > > >
> > > > > > > As a first step, I would like to go ahead with another solution, which
> > > > > > > is based on a diff from Marius Strobl, who added those syncs in the
> > > > > > > first place.  Essentially it splits the memory handling into cacheable
> > > > > > > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > > > > > > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > > > > > > still don't.  This is a bit of a safer approach imho, since we don't
> > > > > > > hurt the controller drivers, but speed up the data buffers.
> > > > > > >
> > > > > > > Once we have verified that there are no regressions, we can adjust
> > > > > > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > > > > > well as before, and maybe then back this out again.
> > > > > > >
> > > > > > > Keep note that this is all a no-op on X86, but all the other archs will
> > > > > > > profit from this.
> > > > > > >
> > > > > > > ok?
> > > > > > >
> > > > > > > Patrick
> > > > > >
> > > > > > Update diff with inverted logic.  kettenis@ argues that we should
> > > > > > invert the logic, and those who need COHERENT memory should ask
> > > > > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > > > > be passed explicitly.  This also points out all those users that
> > > > > > use usb_allocmem() internally, where we might want to have a look
> > > > > > if COHERENT is actually needed or not, or if it can be refactored
> > > > > > in another way.
> > > > >
> > > > > These commits broke usb on imx.6 with cubox:
> > > > >
> > > > > imxehci0 at simplebus3
> > > > > usb0 at imxehci0: USB revision 2.0
> > > > > usb0: root hub problem
> > > > > imxehci1 at simplebus3
> > > > > usb1 at imxehci1: USB revision 2.0
> > > > > usb1: root hub problem
> > > > > "usbmisc" at simplebus3 not configured
> > > >
> > > > pandaboard with omap4 (another cortex a9) also has broken usb with
> > > > the latest snapshot:
> > > >
> > > > omehci0 at simplebus0
> > > > usb0 at omehci0: USB revision 2.0
> > > > usb0: root hub problem
> > >
> > > I think I know what it is.  When we enqueue a request for the root
> > > hub, the buffer, for which the USB subsystem allocates a DMA buffer,
> > > is filled not by an external device, but by our driver.
> > >
> > > Now on completion of that request, since it's doing a READ of the
> > > root hub, it will do a DMA sync with POSTREAD.  Since the ehci code
> > > hasn't flushed the buffer do memory, but simply did a memcpy to the
> > > buffer, the POSTREAD will essentially drop whatever ehci's memcpy
> > > did.
> > >
> > > Can you check if that makes a difference?  It essentially forces the
> > > root hub code to flush the data to the caches, so that the POSTREAD
> > > can successfully flush the cache and read the data from memory.  I
> > > wonder if there's a better way of doing this, but I kinda doubt it.
> > >
> > > Patrick
> > >
> > > diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
> > > index a352d83eaf4..d81901d3762 100644
> > > --- a/sys/dev/usb/ehci.c
> > > +++ b/sys/dev/usb/ehci.c
> > > @@ -2154,6 +2154,9 @@ ehci_root_ctrl_start(struct usbd_xfer *xfer)
> > >   err = USBD_IOERROR;
> > >   goto ret;
> > >   }
> > > + if (totlen != 0)
> > > + usb_syncmem(&xfer->dmabuf, 0, totlen,
> > > +    BUS_DMASYNC_PREWRITE);
> > >   xfer->actlen = totlen;
> > >   err = USBD_NORMAL_COMPLETION;
> > >   ret:
> > >
> >
> > If that diff does indeed work, I think a better fix would be to move the
> > DMA syncs from the USB layer into the driver layer, so that for the root
> > methods no sync has to be done, but all the individual device methods do
> > proper syncs.  Preparing that diff would take some more minutes though.
> > In the meantime we could use USB_DMA_COHERENT for all requests without
> > an extra allocated buffer, which includes all the control requests.
> > Then move the syncs to the drivers, and at last remove the flag again.
>
> Since kettenis@ wondered how that should look like, I took the time to
> look at eHCI and here's how I approached it.  No diff yet, I'll come up
> with that soon.
>
> ======
> Before Start of Transfer:  We need to make sure that the buffers are
>   synced before handing them to the hardware.
>
> Check device_*_start to sync &xfer->dmabuf.  Easiest way is to check,
> where DMAADDR(&xfer->dmabuf) used.  That's where we tell the hardware
> where our transfer buffer is.  So those indicate places where we should
> sync before telling the HW to go.
>
> * ehci_pcd(): No device transfer, just memset, sync would be wrong. OK
> * ehci_root_ctrl_start(): No device transfer, sync would be wrong. OK
> * ehci_alloc_sqtd_chain(): chain is allocated based on the buffer, we
>   already call usb_syncmem() for the buffer. OK
> * ehci_device_intr_done(): called _after_ the transfer completes and
>   after we memcpy the data from a successful transfer. on repeat, where
>   it enqueues the transfer again, it calls usb_syncmem() for POST.  I
>   think that one is nonsense, and should have been done before calling
>   usb_transfer_complete().  Then it calls ehci_alloc_sqtd_chain(), which
>   does a proper PRE-sync. NOT OK
> * ehci_alloc_itd_chain(): Similar to alloc_sqtd_chain(), but there's no
>   sync. NOT OK
> * ehci_alloc_sitd_chain(): Similar to alloc_sqtd_chain(), but there's no
>   sync. NOT OK
> ======
> Before Transfer Completion: We need to make sure buffers are synced
> before handing them back to the stack.
>
> Check usb_transfer_complete() callers to have synced &xfer->dmabuf.  But
> not those for error handling, only those were we supposedly have buffers
> with valid data.
>
> * ehci_isoc_idone(): only the descriptors are synced, probably needs a
>   sync on the buffer. NOT OK
> * ehci_idone(): only the descriptors are synced, probably needs a sync on
>   the buffer, especially since there's the following comment. NOT OK
>
> /* XXX transfer_complete memcpys out transfer data (for in endpoints)
> * during this call, before methods->done is called: dma sync required
> * beforehand? */
> usb_transfer_complete(xfer);
>
> * ehci_root_ctrl_start: No device transfer, same code path as in the
>   other part. OK.
> * all other functions: error handling, no successfull completion, no sync
>   needed. OK.
> ====
>
> So, to summarize:
>
> * ehci_alloc_itd_chain() and ehci_alloc_sitd_chain() both should sync
>   so that the buffers handed to the hardware are fine.
> * ehci_isoc_idone() and ehci_idone() both should sync the buffers
>   modified by the hardware, before handing them to the stack.
> * ehci_device_intr_done() doesn't need to call usb_syncmem again,
>   after we have fixed ehci_idone() to do the sync.
>
> With those changes, removing USB_DMA_COHERENT from the usb_allocmem()
> again, as well as removing the usb_syncmem() calls in usbdi.c, should
> make the Cubox-i work.  I'll come up with a diff and see if I can find
> some Cubox-i to test it with.
>
> Though we'll still need to check ohci, uhci, xhci and dwc2 before we
> can actually rip that out again.
>
> Patrick

And that's the diff I'd try based on the analysis.

Keep note that the reason the isochronous completion uses xfer->length
instead of xfer->actlen is, that isochronous transfers don't use
continuous buffers.  So it's either going through each successful frame
and doing a sync per frame, or simply syncing the whole buffer.

diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
index a352d83eaf4..e70eea42aa7 100644
--- a/sys/dev/usb/ehci.c
+++ b/sys/dev/usb/ehci.c
@@ -856,6 +856,10 @@ ehci_isoc_idone(struct usbd_xfer *xfer)
 #endif
  xfer->actlen = actlen;
  xfer->status = USBD_NORMAL_COMPLETION;
+
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
  usb_transfer_complete(xfer);
 }
 
@@ -911,9 +915,9 @@ ehci_idone(struct usbd_xfer *xfer)
  } else
  xfer->status = USBD_NORMAL_COMPLETION;
 
- /* XXX transfer_complete memcpys out transfer data (for in endpoints)
- * during this call, before methods->done is called: dma sync required
- * beforehand? */
+ usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
  usb_transfer_complete(xfer);
  DPRINTFN(/*12*/2, ("ehci_idone: ex=%p done\n", ex));
 }
@@ -3208,9 +3212,6 @@ ehci_device_intr_done(struct usbd_xfer *xfer)
  if (xfer->pipe->repeat) {
  ehci_free_sqtd_chain(sc, ex);
 
- usb_syncmem(&xfer->dmabuf, 0, xfer->length,
-    usbd_xfer_isread(xfer) ?
-    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
  sqh = epipe->sqh;
 
  err = ehci_alloc_sqtd_chain(sc, xfer->length, xfer, &data, &dataend);
@@ -3408,6 +3409,9 @@ ehci_alloc_itd_chain(struct ehci_softc *sc, struct usbd_xfer *xfer)
  if (nframes == 0)
  return (1);
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
  for (i = 0; i < nframes; i++) {
  uint32_t froffs = offs;
 
@@ -3522,6 +3526,9 @@ ehci_alloc_sitd_chain(struct ehci_softc *sc, struct usbd_xfer *xfer)
  if (usbd_xfer_isread(xfer))
  endp |= EHCI_SITD_SET_DIR(1);
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
  for (i = 0; i < nframes; i++) {
  uint32_t addr = DMAADDR(&xfer->dmabuf, offs);
  uint32_t page = EHCI_PAGE(addr + xfer->frlengths[i] - 1);

Reply | Threaded
Open this post in threaded view
|

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

Patrick Wildt-3
On Wed, Apr 01, 2020 at 12:23:53PM +0200, Patrick Wildt wrote:

> On Wed, Apr 01, 2020 at 12:04:25PM +0200, Patrick Wildt wrote:
> > On Wed, Apr 01, 2020 at 09:40:06AM +0200, Patrick Wildt wrote:
> > > On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:
> > > > On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > > > > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > > > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > > > > Hi,
> > > > > > > >
> > > > > > > > I've spent a few days investigating why USB ethernet adapters are so
> > > > > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > > > > > > > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > > > > > > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > > > > > > Using cached data buffers makes the performance rise from 20 mbit/s to
> > > > > > > > 200 mbit/s.  Quite a difference.
> > > > > > > >
> > > > > > > > sys/dev/usb/usb_mem.c:
> > > > > > > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > > > > >       &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > > > > >
> > > > > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > > > > > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > > > > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > > > > >
> > > > > > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > > > > syncs in the USB stack, which I think are proper.
> > > > > > > >
> > > > > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > > > > >
> > > > > > > > if (!usbd_xfer_isread(xfer)) {
> > > > > > > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > > > > > memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
> > > > > > > >    xfer->length);
> > > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > > > > >    BUS_DMASYNC_PREWRITE);
> > > > > > > > } else
> > > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> > > > > > > >    BUS_DMASYNC_PREREAD);
> > > > > > > > err = pipe->methods->transfer(xfer);
> > > > > > > >
> > > > > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > > > > >
> > > > > > > > if (xfer->actlen != 0) {
> > > > > > > > if (usbd_xfer_isread(xfer)) {
> > > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > > > > >    BUS_DMASYNC_POSTREAD);
> > > > > > > > if (!(xfer->flags & USBD_NO_COPY))
> > > > > > > > memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
> > > > > > > >    xfer->actlen);
> > > > > > > > } else
> > > > > > > > usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> > > > > > > >    BUS_DMASYNC_POSTWRITE);
> > > > > > > > }
> > > > > > > >
> > > > > > > > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > > > > > > > the same backend to allocate their rings.  And I can't vouch for those
> > > > > > > > drivers' sanity.
> > > > > > > >
> > > > > > > > As a first step, I would like to go ahead with another solution, which
> > > > > > > > is based on a diff from Marius Strobl, who added those syncs in the
> > > > > > > > first place.  Essentially it splits the memory handling into cacheable
> > > > > > > > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > > > > > > > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > > > > > > > still don't.  This is a bit of a safer approach imho, since we don't
> > > > > > > > hurt the controller drivers, but speed up the data buffers.
> > > > > > > >
> > > > > > > > Once we have verified that there are no regressions, we can adjust
> > > > > > > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > > > > > > well as before, and maybe then back this out again.
> > > > > > > >
> > > > > > > > Keep note that this is all a no-op on X86, but all the other archs will
> > > > > > > > profit from this.
> > > > > > > >
> > > > > > > > ok?
> > > > > > > >
> > > > > > > > Patrick
> > > > > > >
> > > > > > > Update diff with inverted logic.  kettenis@ argues that we should
> > > > > > > invert the logic, and those who need COHERENT memory should ask
> > > > > > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > > > > > be passed explicitly.  This also points out all those users that
> > > > > > > use usb_allocmem() internally, where we might want to have a look
> > > > > > > if COHERENT is actually needed or not, or if it can be refactored
> > > > > > > in another way.
> > > > > >
> > > > > > These commits broke usb on imx.6 with cubox:
> > > > > >
> > > > > > imxehci0 at simplebus3
> > > > > > usb0 at imxehci0: USB revision 2.0
> > > > > > usb0: root hub problem
> > > > > > imxehci1 at simplebus3
> > > > > > usb1 at imxehci1: USB revision 2.0
> > > > > > usb1: root hub problem
> > > > > > "usbmisc" at simplebus3 not configured
> > > > >
> > > > > pandaboard with omap4 (another cortex a9) also has broken usb with
> > > > > the latest snapshot:
> > > > >
> > > > > omehci0 at simplebus0
> > > > > usb0 at omehci0: USB revision 2.0
> > > > > usb0: root hub problem
> > > >
> > > > I think I know what it is.  When we enqueue a request for the root
> > > > hub, the buffer, for which the USB subsystem allocates a DMA buffer,
> > > > is filled not by an external device, but by our driver.
> > > >
> > > > Now on completion of that request, since it's doing a READ of the
> > > > root hub, it will do a DMA sync with POSTREAD.  Since the ehci code
> > > > hasn't flushed the buffer do memory, but simply did a memcpy to the
> > > > buffer, the POSTREAD will essentially drop whatever ehci's memcpy
> > > > did.
> > > >
> > > > Can you check if that makes a difference?  It essentially forces the
> > > > root hub code to flush the data to the caches, so that the POSTREAD
> > > > can successfully flush the cache and read the data from memory.  I
> > > > wonder if there's a better way of doing this, but I kinda doubt it.
> > > >
> > > > Patrick
> > > >
> > > > diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
> > > > index a352d83eaf4..d81901d3762 100644
> > > > --- a/sys/dev/usb/ehci.c
> > > > +++ b/sys/dev/usb/ehci.c
> > > > @@ -2154,6 +2154,9 @@ ehci_root_ctrl_start(struct usbd_xfer *xfer)
> > > >   err = USBD_IOERROR;
> > > >   goto ret;
> > > >   }
> > > > + if (totlen != 0)
> > > > + usb_syncmem(&xfer->dmabuf, 0, totlen,
> > > > +    BUS_DMASYNC_PREWRITE);
> > > >   xfer->actlen = totlen;
> > > >   err = USBD_NORMAL_COMPLETION;
> > > >   ret:
> > > >
> > >
> > > If that diff does indeed work, I think a better fix would be to move the
> > > DMA syncs from the USB layer into the driver layer, so that for the root
> > > methods no sync has to be done, but all the individual device methods do
> > > proper syncs.  Preparing that diff would take some more minutes though.
> > > In the meantime we could use USB_DMA_COHERENT for all requests without
> > > an extra allocated buffer, which includes all the control requests.
> > > Then move the syncs to the drivers, and at last remove the flag again.
> >
> > Since kettenis@ wondered how that should look like, I took the time to
> > look at eHCI and here's how I approached it.  No diff yet, I'll come up
> > with that soon.
> >
> > ======
> > Before Start of Transfer:  We need to make sure that the buffers are
> >   synced before handing them to the hardware.
> >
> > Check device_*_start to sync &xfer->dmabuf.  Easiest way is to check,
> > where DMAADDR(&xfer->dmabuf) used.  That's where we tell the hardware
> > where our transfer buffer is.  So those indicate places where we should
> > sync before telling the HW to go.
> >
> > * ehci_pcd(): No device transfer, just memset, sync would be wrong. OK
> > * ehci_root_ctrl_start(): No device transfer, sync would be wrong. OK
> > * ehci_alloc_sqtd_chain(): chain is allocated based on the buffer, we
> >   already call usb_syncmem() for the buffer. OK
> > * ehci_device_intr_done(): called _after_ the transfer completes and
> >   after we memcpy the data from a successful transfer. on repeat, where
> >   it enqueues the transfer again, it calls usb_syncmem() for POST.  I
> >   think that one is nonsense, and should have been done before calling
> >   usb_transfer_complete().  Then it calls ehci_alloc_sqtd_chain(), which
> >   does a proper PRE-sync. NOT OK
> > * ehci_alloc_itd_chain(): Similar to alloc_sqtd_chain(), but there's no
> >   sync. NOT OK
> > * ehci_alloc_sitd_chain(): Similar to alloc_sqtd_chain(), but there's no
> >   sync. NOT OK
> > ======
> > Before Transfer Completion: We need to make sure buffers are synced
> > before handing them back to the stack.
> >
> > Check usb_transfer_complete() callers to have synced &xfer->dmabuf.  But
> > not those for error handling, only those were we supposedly have buffers
> > with valid data.
> >
> > * ehci_isoc_idone(): only the descriptors are synced, probably needs a
> >   sync on the buffer. NOT OK
> > * ehci_idone(): only the descriptors are synced, probably needs a sync on
> >   the buffer, especially since there's the following comment. NOT OK
> >
> > /* XXX transfer_complete memcpys out transfer data (for in endpoints)
> > * during this call, before methods->done is called: dma sync required
> > * beforehand? */
> > usb_transfer_complete(xfer);
> >
> > * ehci_root_ctrl_start: No device transfer, same code path as in the
> >   other part. OK.
> > * all other functions: error handling, no successfull completion, no sync
> >   needed. OK.
> > ====
> >
> > So, to summarize:
> >
> > * ehci_alloc_itd_chain() and ehci_alloc_sitd_chain() both should sync
> >   so that the buffers handed to the hardware are fine.
> > * ehci_isoc_idone() and ehci_idone() both should sync the buffers
> >   modified by the hardware, before handing them to the stack.
> > * ehci_device_intr_done() doesn't need to call usb_syncmem again,
> >   after we have fixed ehci_idone() to do the sync.
> >
> > With those changes, removing USB_DMA_COHERENT from the usb_allocmem()
> > again, as well as removing the usb_syncmem() calls in usbdi.c, should
> > make the Cubox-i work.  I'll come up with a diff and see if I can find
> > some Cubox-i to test it with.
> >
> > Though we'll still need to check ohci, uhci, xhci and dwc2 before we
> > can actually rip that out again.
> >
> > Patrick
>
> And that's the diff I'd try based on the analysis.
>
> Keep note that the reason the isochronous completion uses xfer->length
> instead of xfer->actlen is, that isochronous transfers don't use
> continuous buffers.  So it's either going through each successful frame
> and doing a sync per frame, or simply syncing the whole buffer.

Hi,

I went through all the drivers and this is what I came up with.

DWC2: This one felt like the hardest, because the complexity of
the driver is... huge.  It's not a host controller, it's a gadget
controller that we use like a host controller.  Anyway, I figured
there are two relevant points: dwc2_device_start(), where we set
up the transfer.  And there's a nice if-case already where I just
put the PRE-sync.  For the POST-, there is a single exit-point that
we can use, and if we were successful and transfered data, we can do
a sync.

EHCI: I talked about that in the previous mail.

OHCI: bulk&ctrl use ohci_alloc_std_chain() for setting up the DMA,
while intr/isoc have their own specific one.  All xfertypes have
their exit-point in ohci_softintr, so that's an easy point to use
for the syncs.

UHCI: intr&ctrl&bulk use uhci_alloc_std_chain(), only isoc has its
own again.  The single exit-point in uhci_idone makes it easy as
well.

XHCI: the device_*_start() functions each handle the TDs themselves,
so it's easy to add the syncs.  There are two exit points, one for
isoc, one for the "rest".

USBDI: With the syncs now in the driver, we can remove the syncs
there, and also remove the workaround I committed yesterday.

Patrick

diff --git a/sys/dev/usb/dwc2/dwc2.c b/sys/dev/usb/dwc2/dwc2.c
index 6ca3cc658e5..e14dbf02746 100644
--- a/sys/dev/usb/dwc2/dwc2.c
+++ b/sys/dev/usb/dwc2/dwc2.c
@@ -1260,6 +1260,9 @@ dwc2_device_start(struct usbd_xfer *xfer)
  dwc2_urb->usbdma = &xfer->dmabuf;
  dwc2_urb->buf = KERNADDR(dwc2_urb->usbdma, 0);
  dwc2_urb->dma = DMAADDR(dwc2_urb->usbdma, 0);
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
  }
  dwc2_urb->length = len;
  dwc2_urb->flags = flags;
@@ -1649,6 +1652,17 @@ void dwc2_host_complete(struct dwc2_hsotg *hsotg, struct dwc2_qtd *qtd,
  xfer);
  }
 
+ if (xfer->status == USBD_NORMAL_COMPLETION) {
+ if (xfertype == UE_ISOCHRONOUS)
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
+ else if (xfer->actlen)
+ usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
+ }
+
  qtd->urb = NULL;
  timeout_del(&xfer->timeout_handle);
  usb_rem_task(xfer->device, &xfer->abort_task);
diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
index a352d83eaf4..94a27f48bd9 100644
--- a/sys/dev/usb/ehci.c
+++ b/sys/dev/usb/ehci.c
@@ -856,6 +856,10 @@ ehci_isoc_idone(struct usbd_xfer *xfer)
 #endif
  xfer->actlen = actlen;
  xfer->status = USBD_NORMAL_COMPLETION;
+
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
  usb_transfer_complete(xfer);
 }
 
@@ -911,9 +915,10 @@ ehci_idone(struct usbd_xfer *xfer)
  } else
  xfer->status = USBD_NORMAL_COMPLETION;
 
- /* XXX transfer_complete memcpys out transfer data (for in endpoints)
- * during this call, before methods->done is called: dma sync required
- * beforehand? */
+ if (xfer->actlen)
+ usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
  usb_transfer_complete(xfer);
  DPRINTFN(/*12*/2, ("ehci_idone: ex=%p done\n", ex));
 }
@@ -3208,9 +3213,6 @@ ehci_device_intr_done(struct usbd_xfer *xfer)
  if (xfer->pipe->repeat) {
  ehci_free_sqtd_chain(sc, ex);
 
- usb_syncmem(&xfer->dmabuf, 0, xfer->length,
-    usbd_xfer_isread(xfer) ?
-    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
  sqh = epipe->sqh;
 
  err = ehci_alloc_sqtd_chain(sc, xfer->length, xfer, &data, &dataend);
@@ -3408,6 +3410,9 @@ ehci_alloc_itd_chain(struct ehci_softc *sc, struct usbd_xfer *xfer)
  if (nframes == 0)
  return (1);
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
  for (i = 0; i < nframes; i++) {
  uint32_t froffs = offs;
 
@@ -3522,6 +3527,9 @@ ehci_alloc_sitd_chain(struct ehci_softc *sc, struct usbd_xfer *xfer)
  if (usbd_xfer_isread(xfer))
  endp |= EHCI_SITD_SET_DIR(1);
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
  for (i = 0; i < nframes; i++) {
  uint32_t addr = DMAADDR(&xfer->dmabuf, offs);
  uint32_t page = EHCI_PAGE(addr + xfer->frlengths[i] - 1);
diff --git a/sys/dev/usb/ohci.c b/sys/dev/usb/ohci.c
index 7aa2303eabd..1144228b7dc 100644
--- a/sys/dev/usb/ohci.c
+++ b/sys/dev/usb/ohci.c
@@ -491,6 +491,10 @@ ohci_alloc_std_chain(struct ohci_softc *sc, u_int alen, struct usbd_xfer *xfer,
 
  DPRINTFN(alen < 4096,("ohci_alloc_std_chain: start len=%u\n", alen));
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
+
  len = alen;
  cur = sp;
  end = NULL;
@@ -1279,6 +1283,12 @@ ohci_softintr(void *v)
 
  ohci_free_std(sc, std);
  if (done) {
+ if (xfer->actlen)
+ usb_syncmem(&xfer->dmabuf, 0,
+    xfer->actlen,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD :
+    BUS_DMASYNC_POSTWRITE);
  xfer->status = USBD_NORMAL_COMPLETION;
  s = splusb();
  usb_transfer_complete(xfer);
@@ -1309,9 +1319,15 @@ ohci_softintr(void *v)
 
  if (cc == OHCI_CC_STALL)
  xfer->status = USBD_STALLED;
- else if (cc == OHCI_CC_DATA_UNDERRUN)
+ else if (cc == OHCI_CC_DATA_UNDERRUN) {
+ if (xfer->actlen)
+ usb_syncmem(&xfer->dmabuf, 0,
+    xfer->actlen,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD :
+    BUS_DMASYNC_POSTWRITE);
  xfer->status = USBD_NORMAL_COMPLETION;
- else
+ } else
  xfer->status = USBD_IOERROR;
  s = splusb();
  usb_transfer_complete(xfer);
@@ -1389,6 +1405,13 @@ ohci_softintr(void *v)
  xfer->actlen = actlen;
  xfer->hcpriv = NULL;
 
+ if (xfer->status == USBD_NORMAL_COMPLETION) {
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD :
+    BUS_DMASYNC_POSTWRITE);
+ }
+
  s = splusb();
  usb_transfer_complete(xfer);
  splx(s);
@@ -2846,6 +2869,10 @@ ohci_device_intr_start(struct usbd_xfer *xfer)
  panic("ohci_device_intr_transfer: a request");
 #endif
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
+
  len = xfer->length;
  endpt = xfer->pipe->endpoint->edesc->bEndpointAddress;
 
@@ -3057,6 +3084,10 @@ ohci_device_isoc_enter(struct usbd_xfer *xfer)
  if (sc->sc_bus.dying)
  return;
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
+
  if (iso->next == -1) {
  /* Not in use yet, schedule it a few frames ahead. */
  iso->next = letoh32(sc->sc_hcca->hcca_frame_number) + 5;
diff --git a/sys/dev/usb/uhci.c b/sys/dev/usb/uhci.c
index bae6c1b81a9..e3c7c57580f 100644
--- a/sys/dev/usb/uhci.c
+++ b/sys/dev/usb/uhci.c
@@ -1236,6 +1236,9 @@ uhci_idone(struct usbd_xfer *xfer)
  actlen += len;
  }
  upipe->u.iso.inuse -= nframes;
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
  xfer->actlen = actlen;
  xfer->status = USBD_NORMAL_COMPLETION;
  goto end;
@@ -1299,6 +1302,10 @@ uhci_idone(struct usbd_xfer *xfer)
  else
  xfer->status = USBD_IOERROR; /* more info XXX */
  } else {
+ if (xfer->actlen)
+ usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
  xfer->status = USBD_NORMAL_COMPLETION;
  }
 
@@ -1527,6 +1534,10 @@ uhci_alloc_std_chain(struct uhci_softc *sc, u_int len, struct usbd_xfer *xfer,
     __func__, addr, UE_GET_ADDR(endpt), len, xfer->device->speed,
     flags));
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
+
  mps = UGETW(xfer->pipe->endpoint->edesc->wMaxPacketSize);
  if (mps == 0) {
  printf("uhci_alloc_std_chain: mps=0\n");
@@ -2127,6 +2138,10 @@ uhci_device_isoc_enter(struct usbd_xfer *xfer)
  printf("uhci_device_isoc_enter: overflow!\n");
 #endif
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
+
  next = iso->next;
  if (next == -1) {
  /* Not in use yet, schedule it a few frames ahead. */
diff --git a/sys/dev/usb/usbdi.c b/sys/dev/usb/usbdi.c
index f6707432e41..5ba01fbd0bd 100644
--- a/sys/dev/usb/usbdi.c
+++ b/sys/dev/usb/usbdi.c
@@ -305,22 +305,15 @@ usbd_transfer(struct usbd_xfer *xfer)
  if (xfer->rqflags & URQ_AUTO_DMABUF)
  printf("usbd_transfer: has old buffer!\n");
 #endif
- err = usb_allocmem(bus, xfer->length, 0, USB_DMA_COHERENT,
-    &xfer->dmabuf);
+ err = usb_allocmem(bus, xfer->length, 0, 0, &xfer->dmabuf);
  if (err)
  return (err);
  xfer->rqflags |= URQ_AUTO_DMABUF;
  }
 
- if (!usbd_xfer_isread(xfer)) {
- if ((xfer->flags & USBD_NO_COPY) == 0)
- memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
-    xfer->length);
- usb_syncmem(&xfer->dmabuf, 0, xfer->length,
-    BUS_DMASYNC_PREWRITE);
- } else
- usb_syncmem(&xfer->dmabuf, 0, xfer->length,
-    BUS_DMASYNC_PREREAD);
+ if (!usbd_xfer_isread(xfer) && (xfer->flags & USBD_NO_COPY) == 0)
+ memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
+    xfer->length);
 
  usb_tap(bus, xfer, USBTAP_DIR_OUT);
 
@@ -750,17 +743,10 @@ usb_transfer_complete(struct usbd_xfer *xfer)
  }
 #endif
 
- if (xfer->actlen != 0) {
- if (usbd_xfer_isread(xfer)) {
- usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
-    BUS_DMASYNC_POSTREAD);
- if (!(xfer->flags & USBD_NO_COPY))
- memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
-    xfer->actlen);
- } else
- usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
-    BUS_DMASYNC_POSTWRITE);
- }
+ if (usbd_xfer_isread(xfer) && xfer->actlen != 0 &&
+    (xfer->flags & USBD_NO_COPY) == 0)
+ memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
+    xfer->actlen);
 
  /* if we allocated the buffer in usbd_transfer() we free it here. */
  if (xfer->rqflags & URQ_AUTO_DMABUF) {
diff --git a/sys/dev/usb/xhci.c b/sys/dev/usb/xhci.c
index f98f9a0b8fe..b73d4fc2ac2 100644
--- a/sys/dev/usb/xhci.c
+++ b/sys/dev/usb/xhci.c
@@ -851,6 +851,10 @@ xhci_event_xfer_generic(struct xhci_softc *sc, struct usbd_xfer *xfer,
  else
  xfer->actlen = xfer->length;
  }
+ if (xfer->actlen)
+ usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
  xfer->status = USBD_NORMAL_COMPLETION;
  break;
  case XHCI_CODE_SHORT_XFER:
@@ -871,6 +875,10 @@ xhci_event_xfer_generic(struct xhci_softc *sc, struct usbd_xfer *xfer,
     DEVNAME(sc), xfer, xx->index));
  return (1);
  }
+ if (xfer->actlen)
+ usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
  xfer->status = USBD_NORMAL_COMPLETION;
  break;
  case XHCI_CODE_TXERR:
@@ -975,6 +983,9 @@ xhci_event_xfer_isoc(struct usbd_xfer *xfer, struct xhci_pipe *xp,
  xp->skip = 0;
  }
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
  xfer->status = USBD_NORMAL_COMPLETION;
 
  return (0);
@@ -2809,6 +2820,11 @@ xhci_device_ctrl_start(struct usbd_xfer *xfer)
  if (xp->free_trbs < 3)
  return (USBD_NOMEM);
 
+ if (len != 0)
+ usb_syncmem(&xfer->dmabuf, 0, len,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
+
  /* We'll toggle the setup TRB once we're finished with the stages. */
  trb0 = xhci_xfer_get_trb(sc, xfer, &toggle, 0);
 
@@ -2935,6 +2951,10 @@ xhci_device_generic_start(struct usbd_xfer *xfer)
  if (xp->free_trbs < (ntrb + zerotd))
  return (USBD_NOMEM);
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
+
  /* We'll toggle the first TRB once we're finished with the chain. */
  trb0 = xhci_xfer_get_trb(sc, xfer, &toggle, (ntrb == 1));
  flags = XHCI_TRB_TYPE_NORMAL | (toggle ^ 1);
@@ -3091,6 +3111,10 @@ xhci_device_isoc_start(struct usbd_xfer *xfer)
  if (xp->free_trbs < ntrb)
  return (USBD_NOMEM);
 
+ usb_syncmem(&xfer->dmabuf, 0, xfer->length,
+    usbd_xfer_isread(xfer) ?
+    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
+
  paddr = DMAADDR(&xfer->dmabuf, 0);
 
  for (i = 0, trb0 = NULL; i < xfer->nframes; i++) {