bypass support for iommu on sparc64

classic Classic list List threaded Threaded
19 messages Options
Reply | Threaded
Open this post in threaded view
|

bypass support for iommu on sparc64

David Gwynne-5
on modern sparc64s (think fire or sparc enterprise Mx000 boxes),
setting up and tearing down the translation table entries (TTEs)
is very expensive. so expensive that the cost of doing it for disk
io has a noticeable impact on compile times.

now that there's a BUS_DMA_64BIT flag, we can use that to decide
to bypass the iommu for devices that set that flag, therefore
avoiding the cost of handling the TTEs.

the following diff adds support for bypass mappings to the iommu
code on sparc64. it's based on a diff from kettenis@ back in 2009.
the main changes are around coping with the differences between
schizo/psycho and fire/oberon.

the differences between the chips are now represented by an iommu_hw
struct. these differences include how to enable the iommu (now via
a function pointer), and masks for bypass addresses.

i've tested this on oberon (on an m4000) and schizo (on a v880).
however, the bypass code isn't working on fire (v245s). to cope with
that for now, the iommu_hw struct lets drivers mask flag bits that
are handled when creating a dmamap. this means fire boards will
ignore BUS_DMA_64BIT until i can figure out what's wrong with them.

i have not tested this on psycho yet. if anyone has such a machine
and is willing to work with me to figure it out, please talk to me.

Index: dev/iommu.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/iommu.c,v
retrieving revision 1.74
diff -u -p -r1.74 iommu.c
--- dev/iommu.c 30 Apr 2017 16:45:45 -0000 1.74
+++ dev/iommu.c 8 May 2017 00:45:05 -0000
@@ -100,6 +100,25 @@ void iommu_iomap_clear_pages(struct iomm
 void _iommu_dvmamap_sync(bus_dma_tag_t, bus_dma_tag_t, bus_dmamap_t,
     bus_addr_t, bus_size_t, int);
 
+void iommu_hw_enable(struct iommu_state *);
+
+const struct iommu_hw iommu_hw_default = {
+ .ihw_enable = iommu_hw_enable,
+
+ .ihw_dvma_pa = IOTTE_PAMASK,
+
+ .ihw_bypass = 0x3fffUL << 50,
+ .ihw_bypass_nc = 0,
+ .ihw_bypass_ro = 0,
+};
+
+void
+iommu_hw_enable(struct iommu_state *is)
+{
+ IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
+ IOMMUREG_WRITE(is, iommu_cr, IOMMUCR_EN | (is->is_tsbsize << 16));
+}
+
 /*
  * Initiate an STC entry flush.
  */
@@ -125,7 +144,8 @@ iommu_strbuf_flush(struct strbuf_ctl *sb
  * - create a private DVMA map.
  */
 void
-iommu_init(char *name, struct iommu_state *is, int tsbsize, u_int32_t iovabase)
+iommu_init(char *name, const struct iommu_hw *ihw, struct iommu_state *is,
+    int tsbsize, u_int32_t iovabase)
 {
  psize_t size;
  vaddr_t va;
@@ -149,13 +169,9 @@ iommu_init(char *name, struct iommu_stat
  * be hard-wired, so we read the start and size from the PROM and
  * just use those values.
  */
- if (strncmp(name, "pyro", 4) == 0) {
- is->is_cr = IOMMUREG_READ(is, iommu_cr);
- is->is_cr &= ~IOMMUCR_FIRE_BE;
- is->is_cr |= (IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
-    IOMMUCR_FIRE_TE);
- } else
- is->is_cr = IOMMUCR_EN;
+
+ is->is_hw = ihw;
+
  is->is_tsbsize = tsbsize;
  if (iovabase == (u_int32_t)-1) {
  is->is_dvmabase = IOTSB_VSTART(is->is_tsbsize);
@@ -237,15 +253,6 @@ iommu_init(char *name, struct iommu_stat
  mtx_init(&is->is_mtx, IPL_HIGH);
 
  /*
- * Set the TSB size.  The relevant bits were moved to the TSB
- * base register in the PCIe host bridges.
- */
- if (strncmp(name, "pyro", 4) == 0)
- is->is_ptsb |= is->is_tsbsize;
- else
- is->is_cr |= (is->is_tsbsize << 16);
-
- /*
  * Now actually start up the IOMMU.
  */
  iommu_reset(is);
@@ -262,10 +269,7 @@ iommu_reset(struct iommu_state *is)
 {
  int i;
 
- IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
-
- /* Enable IOMMU */
- IOMMUREG_WRITE(is, iommu_cr, is->is_cr);
+ (*is->is_hw->ihw_enable)(is);
 
  for (i = 0; i < 2; ++i) {
  struct strbuf_ctl *sb = is->is_sb[i];
@@ -280,7 +284,7 @@ iommu_reset(struct iommu_state *is)
  printf(", STC%d enabled", i);
  }
 
- if (is->is_flags & IOMMU_FLUSH_CACHE)
+ if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE))
  IOMMUREG_WRITE(is, iommu_cache_invalidate, -1ULL);
 }
 
@@ -433,7 +437,7 @@ iommu_extract(struct iommu_state *is, bu
  if (dva >= is->is_dvmabase && dva <= is->is_dvmaend)
  tte = is->is_tsb[IOTSBSLOT(dva, is->is_tsbsize)];
 
- return (tte & IOTTE_PAMASK);
+ return (tte & is->is_hw->ihw_dvma_pa);
 }
 
 /*
@@ -601,8 +605,11 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
 {
  int ret;
  bus_dmamap_t map;
+ struct iommu_state *is = sb->sb_iommu;
  struct iommu_map_state *ims;
 
+ flags &= ~is->is_hw->ihw_dma_flags;
+
  BUS_DMA_FIND_PARENT(t, _dmamap_create);
  ret = (*t->_dmamap_create)(t, t0, size, nsegments, maxsegsz, boundary,
     flags, &map);
@@ -610,6 +617,12 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
  if (ret)
  return (ret);
 
+ if (flags & BUS_DMA_64BIT) {
+ map->_dm_cookie = is;
+ *dmamap = map;
+ return (0);
+ }
+
  ims = iommu_iomap_create(atop(round_page(size)));
 
  if (ims == NULL) {
@@ -641,8 +654,10 @@ iommu_dvmamap_destroy(bus_dma_tag_t t, b
  if (map->dm_nsegs)
  bus_dmamap_unload(t0, map);
 
-        if (map->_dm_cookie)
-                iommu_iomap_destroy(map->_dm_cookie);
+ if (!ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
+        if (map->_dm_cookie)
+ iommu_iomap_destroy(map->_dm_cookie);
+ }
  map->_dm_cookie = NULL;
 
  BUS_DMA_FIND_PARENT(t, _dmamap_destroy);
@@ -667,36 +682,36 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_
  u_long dvmaddr, sgstart, sgend;
  bus_size_t align, boundary;
  struct iommu_state *is;
- struct iommu_map_state *ims = map->_dm_cookie;
+ struct iommu_map_state *ims;
  pmap_t pmap;
 
-#ifdef DIAGNOSTIC
- if (ims == NULL)
- panic("iommu_dvmamap_load: null map state");
-#endif
-#ifdef DEBUG
- if (ims->ims_sb == NULL)
- panic("iommu_dvmamap_load: null sb");
- if (ims->ims_sb->sb_iommu == NULL)
- panic("iommu_dvmamap_load: null iommu");
-#endif /* DEBUG */
- is = ims->ims_sb->sb_iommu;
-
- if (map->dm_nsegs) {
- /*
- * Is it still in use? _bus_dmamap_load should have taken care
- * of this.
- */
-#ifdef DIAGNOSTIC
- panic("iommu_dvmamap_load: map still in use");
-#endif
- bus_dmamap_unload(t0, map);
- }
-
  /*
  * Make sure that on error condition we return "no valid mappings".
  */
- map->dm_nsegs = 0;
+ KASSERTMSG(map->dm_nsegs == 0, "map still in use");
+
+ if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
+ unsigned long bypass;
+ int i;
+
+ is = map->_dm_cookie;
+ bypass = is->is_hw->ihw_bypass;
+
+ /* Bypass translation by the IOMMU. */
+
+ BUS_DMA_FIND_PARENT(t, _dmamap_load);
+ err = (*t->_dmamap_load)(t, t0, map, buf, buflen, p, flags);
+ if (err != 0)
+ return (err);
+
+ for (i = 0; i < map->dm_nsegs; i++)
+ map->dm_segs[i].ds_addr |= bypass;
+
+ return (0);
+ }
+
+ ims = map->_dm_cookie;
+ is = ims->ims_sb->sb_iommu;
 
  if (buflen < 1 || buflen > map->_dm_size) {
  DPRINTF(IDB_BUSDMA,
@@ -876,28 +891,31 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t,
  bus_size_t boundary, align;
  u_long dvmaddr, sgstart, sgend;
  struct iommu_state *is;
- struct iommu_map_state *ims = map->_dm_cookie;
+ struct iommu_map_state *ims;
 
-#ifdef DIAGNOSTIC
- if (ims == NULL)
- panic("iommu_dvmamap_load_raw: null map state");
-#endif
-#ifdef DEBUG
- if (ims->ims_sb == NULL)
- panic("iommu_dvmamap_load_raw: null sb");
- if (ims->ims_sb->sb_iommu == NULL)
- panic("iommu_dvmamap_load_raw: null iommu");
-#endif /* DEBUG */
- is = ims->ims_sb->sb_iommu;
+ KASSERTMSG(map->dm_nsegs == 0, "map stil in use");
 
- if (map->dm_nsegs) {
- /* Already in use?? */
-#ifdef DIAGNOSTIC
- panic("iommu_dvmamap_load_raw: map still in use");
-#endif
- bus_dmamap_unload(t0, map);
+ if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
+ unsigned long bypass;
+
+ is = map->_dm_cookie;
+ bypass = is->is_hw->ihw_bypass;
+
+ /* Bypass translation by the IOMMU. */
+ for (i = 0; i < nsegs; i++) {
+ map->dm_segs[i].ds_addr = bypass | segs[i].ds_addr;
+ map->dm_segs[i].ds_len = segs[i].ds_len;
+ }
+
+ map->dm_nsegs = nsegs;
+ map->dm_mapsize = size;
+
+ return (0);
  }
 
+ ims = map->_dm_cookie;
+ is = ims->ims_sb->sb_iommu;
+
  /*
  * A boundary presented to bus_dmamem_alloc() takes precedence
  * over boundary in the map.
@@ -1088,11 +1106,6 @@ iommu_dvmamap_append_range(bus_dma_tag_t
  bus_dma_segment_t *seg = NULL;
  int i = map->dm_nsegs;
 
-#ifdef DEBUG
- if (ims == NULL)
- panic("iommu_dvmamap_append_range: null map state");
-#endif
-
  sgstart = iommu_iomap_translate(ims, pa);
  sgend = sgstart + length - 1;
 
@@ -1298,20 +1311,17 @@ void
 iommu_dvmamap_unload(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t map)
 {
  struct iommu_state *is;
- struct iommu_map_state *ims = map->_dm_cookie;
+ struct iommu_map_state *ims;
  bus_addr_t dvmaddr = map->_dm_dvmastart;
  bus_size_t sgsize = map->_dm_dvmasize;
  int error;
 
-#ifdef DEBUG
- if (ims == NULL)
- panic("iommu_dvmamap_unload: null map state");
- if (ims->ims_sb == NULL)
- panic("iommu_dvmamap_unload: null sb");
- if (ims->ims_sb->sb_iommu == NULL)
- panic("iommu_dvmamap_unload: null iommu");
-#endif /* DEBUG */
+ if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
+ bus_dmamap_unload(t->_parent, map);
+ return;
+ }
 
+ ims = map->_dm_cookie;
  is = ims->ims_sb->sb_iommu;
 
  /* Flush the iommu */
@@ -1488,7 +1498,7 @@ iommu_dvmamap_print_map(bus_dma_tag_t t,
  break;
  }
 
- if (map->_dm_cookie) {
+ if (!ISSET(map->_dm_flags, BUS_DMA_64BIT) && map->_dm_cookie != NULL) {
  struct iommu_map_state *ims = map->_dm_cookie;
  struct iommu_page_map *ipm = &ims->ims_map;
 
@@ -1546,19 +1556,19 @@ void
 iommu_dvmamap_sync(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t map,
     bus_addr_t offset, bus_size_t len, int ops)
 {
- struct iommu_map_state *ims = map->_dm_cookie;
+ struct iommu_map_state *ims;
 
-#ifdef DIAGNOSTIC
- if (ims == NULL)
- panic("iommu_dvmamap_sync: null map state");
- if (ims->ims_sb == NULL)
- panic("iommu_dvmamap_sync: null sb");
- if (ims->ims_sb->sb_iommu == NULL)
- panic("iommu_dvmamap_sync: null iommu");
-#endif
  if (len == 0)
  return;
 
+ if (map->_dm_flags & BUS_DMA_64BIT) {
+ if (ops & (BUS_DMASYNC_PREWRITE | BUS_DMASYNC_POSTREAD))
+ membar(MemIssue);
+ return;
+ }
+
+ ims = map->_dm_cookie;
+
  if (ops & BUS_DMASYNC_PREWRITE)
  membar(MemIssue);
 
@@ -1622,9 +1632,13 @@ iommu_dvmamem_alloc(bus_dma_tag_t t, bus
     "bound %llx segp %p flags %d\n", (unsigned long long)size,
     (unsigned long long)alignment, (unsigned long long)boundary,
     segs, flags));
+
+ if ((flags & BUS_DMA_64BIT) == 0)
+ flags |= BUS_DMA_DVMA;
+
  BUS_DMA_FIND_PARENT(t, _dmamem_alloc);
  return ((*t->_dmamem_alloc)(t, t0, size, alignment, boundary,
-    segs, nsegs, rsegs, flags | BUS_DMA_DVMA));
+    segs, nsegs, rsegs, flags));
 }
 
 void
@@ -1763,7 +1777,7 @@ iommu_iomap_load_map(struct iommu_state
 
  /* Flush cache if necessary. */
  slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
- if (is->is_flags & IOMMU_FLUSH_CACHE &&
+ if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
     (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
  IOMMUREG_WRITE(is, iommu_cache_flush,
     is->is_ptsb + slot * 8);
@@ -1788,7 +1802,7 @@ iommu_iomap_unload_map(struct iommu_stat
 
  /* Flush cache if necessary. */
  slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
- if (is->is_flags & IOMMU_FLUSH_CACHE &&
+ if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
     (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
  IOMMUREG_WRITE(is, iommu_cache_flush,
     is->is_ptsb + slot * 8);
Index: dev/iommureg.h
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/iommureg.h,v
retrieving revision 1.17
diff -u -p -r1.17 iommureg.h
--- dev/iommureg.h 17 Aug 2012 20:46:50 -0000 1.17
+++ dev/iommureg.h 8 May 2017 00:45:05 -0000
@@ -90,10 +90,10 @@ struct iommu_strbuf {
 #define IOMMUCR_DE 0x000000000000000002LL /* Diag enable */
 #define IOMMUCR_EN 0x000000000000000001LL /* Enable IOMMU */
 
-#define IOMMUCR_FIRE_SE 0x000000000000000400LL /* Snoop enable */
-#define IOMMUCR_FIRE_CM_EN 0x000000000000000300LL  /* Cache mode enable */
-#define IOMMUCR_FIRE_BE 0x000000000000000002LL /* Bypass enable */
-#define IOMMUCR_FIRE_TE 0x000000000000000001LL /* Translation enabled */
+#define IOMMUCR_FIRE_SE 0x000000000000000400UL /* Snoop enable */
+#define IOMMUCR_FIRE_CM_EN 0x000000000000000300UL  /* Cache mode enable */
+#define IOMMUCR_FIRE_BE 0x000000000000000002UL /* Bypass enable */
+#define IOMMUCR_FIRE_TE 0x000000000000000001UL /* Translation enabled */
 
 /*
  * IOMMU stuff
Index: dev/iommuvar.h
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/iommuvar.h,v
retrieving revision 1.17
diff -u -p -r1.17 iommuvar.h
--- dev/iommuvar.h 4 May 2016 18:26:12 -0000 1.17
+++ dev/iommuvar.h 8 May 2017 00:45:05 -0000
@@ -100,6 +100,22 @@ struct iommu_map_state {
 };
 #define IOMMU_MAP_STREAM 1
 
+struct iommu_hw {
+ void (*ihw_enable)(struct iommu_state *);
+
+ unsigned long ihw_dvma_pa;
+
+ unsigned long ihw_bypass;
+ unsigned long ihw_bypass_nc; /* non-cached */
+ unsigned long ihw_bypass_ro; /* relaxed ordering */
+
+ unsigned int ihw_flags;
+#define IOMMU_HW_FLUSH_CACHE (1 << 0)
+ int ihw_dma_flags;
+};
+
+extern const struct iommu_hw iommu_hw_default;
+
 /*
  * per-IOMMU state
  */
@@ -112,8 +128,7 @@ struct iommu_state {
  int64_t is_cr; /* Control register value */
  struct mutex is_mtx;
  struct extent *is_dvmamap; /* DVMA map for this instance */
- int is_flags;
-#define IOMMU_FLUSH_CACHE 0x00000001
+ const struct iommu_hw *is_hw;
 
  struct strbuf_ctl *is_sb[2]; /* Streaming buffers if any */
 
@@ -126,7 +141,8 @@ struct iommu_state {
 };
 
 /* interfaces for PCI/SBus code */
-void iommu_init(char *, struct iommu_state *, int, u_int32_t);
+void iommu_init(char *, const struct iommu_hw *, struct iommu_state *,
+    int, u_int32_t);
 void iommu_reset(struct iommu_state *);
 paddr_t iommu_extract(struct iommu_state *, bus_addr_t);
 int64_t iommu_lookup_tte(struct iommu_state *, bus_addr_t);
@@ -146,6 +162,7 @@ int iommu_dvmamem_alloc(bus_dma_tag_t, b
     bus_size_t, bus_size_t, bus_dma_segment_t *, int, int *, int);
 void iommu_dvmamem_free(bus_dma_tag_t, bus_dma_tag_t, bus_dma_segment_t *,
     int);
+
 
 #define IOMMUREG_READ(is, reg) \
  bus_space_read_8((is)->is_bustag, \
Index: dev/psycho.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/psycho.c,v
retrieving revision 1.74
diff -u -p -r1.74 psycho.c
--- dev/psycho.c 23 Aug 2016 03:28:01 -0000 1.74
+++ dev/psycho.c 8 May 2017 00:45:05 -0000
@@ -902,7 +902,7 @@ psycho_iommu_init(struct psycho_softc *s
  panic("couldn't malloc iommu name");
  snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
 
- iommu_init(name, is, tsbsize, iobase);
+ iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
 }
 
 /*
Index: dev/pyro.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/pyro.c,v
retrieving revision 1.30
diff -u -p -r1.30 pyro.c
--- dev/pyro.c 20 Dec 2016 13:40:50 -0000 1.30
+++ dev/pyro.c 8 May 2017 00:45:05 -0000
@@ -131,6 +131,32 @@ int pyro_msi_eq_intr(void *);
 int pyro_dmamap_create(bus_dma_tag_t, bus_dma_tag_t, bus_size_t, int,
     bus_size_t, bus_size_t, int, bus_dmamap_t *);
 
+void pyro_iommu_enable(struct iommu_state *);
+
+const struct iommu_hw iommu_hw_pyro = {
+ .ihw_enable = pyro_iommu_enable,
+
+ .ihw_dvma_pa = 0x7ffffffffffUL,
+
+ .ihw_bypass = 0xfffc000000000000UL,
+ .ihw_bypass_nc = 1UL << 43,
+ .ihw_bypass_ro = 0,
+
+ .ihw_dma_flags = BUS_DMA_64BIT,
+};
+
+const struct iommu_hw iommu_hw_oberon = {
+ .ihw_enable = pyro_iommu_enable,
+
+ .ihw_dvma_pa = 0x7fffffffffUL,
+
+ .ihw_bypass = 0x7ffc000000000000UL,
+ .ihw_bypass_nc = 1UL << 47,
+ .ihw_bypass_ro = 0x8000000000000000UL,
+
+ .ihw_flags = IOMMU_HW_FLUSH_CACHE,
+};
+
 #ifdef DDB
 void pyro_xir(void *, int);
 #endif
@@ -266,6 +292,7 @@ pyro_init_iommu(struct pyro_softc *sc, s
  int tsbsize = 7;
  u_int32_t iobase = -1;
  char *name;
+ const struct iommu_hw *ihw = &iommu_hw_pyro;
 
  is->is_bustag = sc->sc_bust;
 
@@ -282,11 +309,23 @@ pyro_init_iommu(struct pyro_softc *sc, s
  panic("couldn't malloc iommu name");
  snprintf(name, 32, "%s dvma", sc->sc_dv.dv_xname);
 
- /* On Oberon, we need to flush the cache. */
  if (sc->sc_oberon)
- is->is_flags |= IOMMU_FLUSH_CACHE;
+ ihw = &iommu_hw_oberon;
+
+ iommu_init(name, ihw, is, tsbsize, iobase);
+}
+
+void
+pyro_iommu_enable(struct iommu_state *is)
+{
+ unsigned long cr;
+
+ cr = IOMMUREG_READ(is, iommu_cr);
+ cr |= IOMMUCR_FIRE_BE | IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
+    IOMMUCR_FIRE_TE;
 
- iommu_init(name, is, tsbsize, iobase);
+ IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb | is->is_tsbsize);
+ IOMMUREG_WRITE(is, iommu_cr, cr);
 }
 
 void
Index: dev/sbus.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/sbus.c,v
retrieving revision 1.44
diff -u -p -r1.44 sbus.c
--- dev/sbus.c 19 Sep 2015 21:07:04 -0000 1.44
+++ dev/sbus.c 8 May 2017 00:45:05 -0000
@@ -349,7 +349,7 @@ sbus_mb_attach(struct device *parent, st
  snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
 
  printf("%s: ", sc->sc_dev.dv_xname);
- iommu_init(name, &sc->sc_is, 0, -1);
+ iommu_init(name, &iommu_hw_default, &sc->sc_is, 0, -1);
 
  /* Initialize Starfire PC interrupt translation. */
  if (OF_getprop(findroot(), "name", buf, sizeof(buf)) > 0 &&
Index: dev/schizo.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/schizo.c,v
retrieving revision 1.67
diff -u -p -r1.67 schizo.c
--- dev/schizo.c 23 Aug 2016 03:28:01 -0000 1.67
+++ dev/schizo.c 8 May 2017 00:45:05 -0000
@@ -451,7 +451,7 @@ schizo_init_iommu(struct schizo_softc *s
     "using iobase=0x%x, tsbsize=%d\n", iobase, tsbsize));
  }
 
- iommu_init(name, is, tsbsize, iobase);
+ iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
 }
 
 int

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

David Gwynne-5
On Mon, May 08, 2017 at 11:03:58AM +1000, David Gwynne wrote:

> on modern sparc64s (think fire or sparc enterprise Mx000 boxes),
> setting up and tearing down the translation table entries (TTEs)
> is very expensive. so expensive that the cost of doing it for disk
> io has a noticable impact on compile times.
>
> now that there's a BUS_DMA_64BIT flag, we can use that to decide
> to bypass the iommu for devices that set that flag, therefore
> avoiding the cost of handling the TTEs.
>
> the following diff adds support for bypass mappings to the iommu
> code on sparc64. it's based on a diff from kettenis@ back in 2009.
> the main changes are around coping with the differences between
> schizo/psycho and fire/oberon.
>
> the differences between the chips are now represented by a iommu_hw
> struct. these differences include how to enable the iommu (now via
> a function pointer), and masks for bypass addresses.
>
> ive tested this on oberon (on an m4000) and schizo (on a v880).
> however, the bypass code isnt working on fire (v245s). to cope with
> that for now, the iommu_hw struct lets drivers mask flag bits that
> are handled when creating a dmamap. this means fire boards will
> ignore BUS_DMA_64BIT until i can figure out whats wrong with them.

i figured it out. it turns out Fire was working fine. however,
enabling 64bit dva on the onboard devices didn't work because the
serverworks/broadcom pcie to pcix bridge can only handle dma addresses
in the low 40 bits. because the fire bypass window is higher than
this, the bridge would choke and things stopped working.

the updated diff attempts to handle this. basically when probing
the bridge, the platform creates a custom dma tag for it. this tag
intercepts bus_dmamap_create and clears the BUS_DMA_64BIT flag before
handing it up to the parent bridge, which is pyro in my situation.
it looks like early sun4v boxes could make use of this too.

> i have not tested this on psycho yet. if anyone has such a machine
> and is willing to work with me to figure it out, please talk to me.

i still don't have psycho reports.

Index: dev/iommu.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/iommu.c,v
retrieving revision 1.74
diff -u -p -r1.74 iommu.c
--- dev/iommu.c 30 Apr 2017 16:45:45 -0000 1.74
+++ dev/iommu.c 10 May 2017 12:00:09 -0000
@@ -100,6 +100,25 @@ void iommu_iomap_clear_pages(struct iomm
 void _iommu_dvmamap_sync(bus_dma_tag_t, bus_dma_tag_t, bus_dmamap_t,
     bus_addr_t, bus_size_t, int);
 
+void iommu_hw_enable(struct iommu_state *);
+
+const struct iommu_hw iommu_hw_default = {
+ .ihw_enable = iommu_hw_enable,
+
+ .ihw_dvma_pa = IOTTE_PAMASK,
+
+ .ihw_bypass = 0x3fffUL << 50,
+ .ihw_bypass_nc = 0,
+ .ihw_bypass_ro = 0,
+};
+
+void
+iommu_hw_enable(struct iommu_state *is)
+{
+ IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
+ IOMMUREG_WRITE(is, iommu_cr, IOMMUCR_EN | (is->is_tsbsize << 16));
+}
+
 /*
  * Initiate an STC entry flush.
  */
@@ -125,7 +144,8 @@ iommu_strbuf_flush(struct strbuf_ctl *sb
  * - create a private DVMA map.
  */
 void
-iommu_init(char *name, struct iommu_state *is, int tsbsize, u_int32_t iovabase)
+iommu_init(char *name, const struct iommu_hw *ihw, struct iommu_state *is,
+    int tsbsize, u_int32_t iovabase)
 {
  psize_t size;
  vaddr_t va;
@@ -149,13 +169,9 @@ iommu_init(char *name, struct iommu_stat
  * be hard-wired, so we read the start and size from the PROM and
  * just use those values.
  */
- if (strncmp(name, "pyro", 4) == 0) {
- is->is_cr = IOMMUREG_READ(is, iommu_cr);
- is->is_cr &= ~IOMMUCR_FIRE_BE;
- is->is_cr |= (IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
-    IOMMUCR_FIRE_TE);
- } else
- is->is_cr = IOMMUCR_EN;
+
+ is->is_hw = ihw;
+
  is->is_tsbsize = tsbsize;
  if (iovabase == (u_int32_t)-1) {
  is->is_dvmabase = IOTSB_VSTART(is->is_tsbsize);
@@ -237,15 +253,6 @@ iommu_init(char *name, struct iommu_stat
  mtx_init(&is->is_mtx, IPL_HIGH);
 
  /*
- * Set the TSB size.  The relevant bits were moved to the TSB
- * base register in the PCIe host bridges.
- */
- if (strncmp(name, "pyro", 4) == 0)
- is->is_ptsb |= is->is_tsbsize;
- else
- is->is_cr |= (is->is_tsbsize << 16);
-
- /*
  * Now actually start up the IOMMU.
  */
  iommu_reset(is);
@@ -262,10 +269,7 @@ iommu_reset(struct iommu_state *is)
 {
  int i;
 
- IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
-
- /* Enable IOMMU */
- IOMMUREG_WRITE(is, iommu_cr, is->is_cr);
+ (*is->is_hw->ihw_enable)(is);
 
  for (i = 0; i < 2; ++i) {
  struct strbuf_ctl *sb = is->is_sb[i];
@@ -280,7 +284,7 @@ iommu_reset(struct iommu_state *is)
  printf(", STC%d enabled", i);
  }
 
- if (is->is_flags & IOMMU_FLUSH_CACHE)
+ if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE))
  IOMMUREG_WRITE(is, iommu_cache_invalidate, -1ULL);
 }
 
@@ -433,7 +437,7 @@ iommu_extract(struct iommu_state *is, bu
  if (dva >= is->is_dvmabase && dva <= is->is_dvmaend)
  tte = is->is_tsb[IOTSBSLOT(dva, is->is_tsbsize)];
 
- return (tte & IOTTE_PAMASK);
+ return (tte & is->is_hw->ihw_dvma_pa);
 }
 
 /*
@@ -601,6 +605,7 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
 {
  int ret;
  bus_dmamap_t map;
+ struct iommu_state *is = sb->sb_iommu;
  struct iommu_map_state *ims;
 
  BUS_DMA_FIND_PARENT(t, _dmamap_create);
@@ -610,6 +615,12 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
  if (ret)
  return (ret);
 
+ if (flags & BUS_DMA_64BIT) {
+ map->_dm_cookie = is;
+ *dmamap = map;
+ return (0);
+ }
+
  ims = iommu_iomap_create(atop(round_page(size)));
 
  if (ims == NULL) {
@@ -641,8 +652,10 @@ iommu_dvmamap_destroy(bus_dma_tag_t t, b
  if (map->dm_nsegs)
  bus_dmamap_unload(t0, map);
 
-        if (map->_dm_cookie)
-                iommu_iomap_destroy(map->_dm_cookie);
+ if (!ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
+        if (map->_dm_cookie)
+ iommu_iomap_destroy(map->_dm_cookie);
+ }
  map->_dm_cookie = NULL;
 
  BUS_DMA_FIND_PARENT(t, _dmamap_destroy);
@@ -667,36 +680,36 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_
  u_long dvmaddr, sgstart, sgend;
  bus_size_t align, boundary;
  struct iommu_state *is;
- struct iommu_map_state *ims = map->_dm_cookie;
+ struct iommu_map_state *ims;
  pmap_t pmap;
 
-#ifdef DIAGNOSTIC
- if (ims == NULL)
- panic("iommu_dvmamap_load: null map state");
-#endif
-#ifdef DEBUG
- if (ims->ims_sb == NULL)
- panic("iommu_dvmamap_load: null sb");
- if (ims->ims_sb->sb_iommu == NULL)
- panic("iommu_dvmamap_load: null iommu");
-#endif /* DEBUG */
- is = ims->ims_sb->sb_iommu;
-
- if (map->dm_nsegs) {
- /*
- * Is it still in use? _bus_dmamap_load should have taken care
- * of this.
- */
-#ifdef DIAGNOSTIC
- panic("iommu_dvmamap_load: map still in use");
-#endif
- bus_dmamap_unload(t0, map);
- }
-
  /*
  * Make sure that on error condition we return "no valid mappings".
  */
- map->dm_nsegs = 0;
+ KASSERTMSG(map->dm_nsegs == 0, "map still in use");
+
+ if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
+ unsigned long bypass;
+ int i;
+
+ is = map->_dm_cookie;
+ bypass = is->is_hw->ihw_bypass;
+
+ /* Bypass translation by the IOMMU. */
+
+ BUS_DMA_FIND_PARENT(t, _dmamap_load);
+ err = (*t->_dmamap_load)(t, t0, map, buf, buflen, p, flags);
+ if (err != 0)
+ return (err);
+
+ for (i = 0; i < map->dm_nsegs; i++)
+ map->dm_segs[i].ds_addr |= bypass;
+
+ return (0);
+ }
+
+ ims = map->_dm_cookie;
+ is = ims->ims_sb->sb_iommu;
 
  if (buflen < 1 || buflen > map->_dm_size) {
  DPRINTF(IDB_BUSDMA,
@@ -876,28 +889,31 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t,
  bus_size_t boundary, align;
  u_long dvmaddr, sgstart, sgend;
  struct iommu_state *is;
- struct iommu_map_state *ims = map->_dm_cookie;
+ struct iommu_map_state *ims;
 
-#ifdef DIAGNOSTIC
- if (ims == NULL)
- panic("iommu_dvmamap_load_raw: null map state");
-#endif
-#ifdef DEBUG
- if (ims->ims_sb == NULL)
- panic("iommu_dvmamap_load_raw: null sb");
- if (ims->ims_sb->sb_iommu == NULL)
- panic("iommu_dvmamap_load_raw: null iommu");
-#endif /* DEBUG */
- is = ims->ims_sb->sb_iommu;
+ KASSERTMSG(map->dm_nsegs == 0, "map stil in use");
 
- if (map->dm_nsegs) {
- /* Already in use?? */
-#ifdef DIAGNOSTIC
- panic("iommu_dvmamap_load_raw: map still in use");
-#endif
- bus_dmamap_unload(t0, map);
+ if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
+ unsigned long bypass;
+
+ is = map->_dm_cookie;
+ bypass = is->is_hw->ihw_bypass;
+
+ /* Bypass translation by the IOMMU. */
+ for (i = 0; i < nsegs; i++) {
+ map->dm_segs[i].ds_addr = bypass | segs[i].ds_addr;
+ map->dm_segs[i].ds_len = segs[i].ds_len;
+ }
+
+ map->dm_nsegs = nsegs;
+ map->dm_mapsize = size;
+
+ return (0);
  }
 
+ ims = map->_dm_cookie;
+ is = ims->ims_sb->sb_iommu;
+
  /*
  * A boundary presented to bus_dmamem_alloc() takes precedence
  * over boundary in the map.
@@ -1088,11 +1104,6 @@ iommu_dvmamap_append_range(bus_dma_tag_t
  bus_dma_segment_t *seg = NULL;
  int i = map->dm_nsegs;
 
-#ifdef DEBUG
- if (ims == NULL)
- panic("iommu_dvmamap_append_range: null map state");
-#endif
-
  sgstart = iommu_iomap_translate(ims, pa);
  sgend = sgstart + length - 1;
 
@@ -1298,20 +1309,17 @@ void
 iommu_dvmamap_unload(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t map)
 {
  struct iommu_state *is;
- struct iommu_map_state *ims = map->_dm_cookie;
+ struct iommu_map_state *ims;
  bus_addr_t dvmaddr = map->_dm_dvmastart;
  bus_size_t sgsize = map->_dm_dvmasize;
  int error;
 
-#ifdef DEBUG
- if (ims == NULL)
- panic("iommu_dvmamap_unload: null map state");
- if (ims->ims_sb == NULL)
- panic("iommu_dvmamap_unload: null sb");
- if (ims->ims_sb->sb_iommu == NULL)
- panic("iommu_dvmamap_unload: null iommu");
-#endif /* DEBUG */
+ if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
+ bus_dmamap_unload(t->_parent, map);
+ return;
+ }
 
+ ims = map->_dm_cookie;
  is = ims->ims_sb->sb_iommu;
 
  /* Flush the iommu */
@@ -1488,7 +1496,7 @@ iommu_dvmamap_print_map(bus_dma_tag_t t,
  break;
  }
 
- if (map->_dm_cookie) {
+ if (!ISSET(map->_dm_flags, BUS_DMA_64BIT) && map->_dm_cookie != NULL) {
  struct iommu_map_state *ims = map->_dm_cookie;
  struct iommu_page_map *ipm = &ims->ims_map;
 
@@ -1546,19 +1554,19 @@ void
 iommu_dvmamap_sync(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t map,
     bus_addr_t offset, bus_size_t len, int ops)
 {
- struct iommu_map_state *ims = map->_dm_cookie;
+ struct iommu_map_state *ims;
 
-#ifdef DIAGNOSTIC
- if (ims == NULL)
- panic("iommu_dvmamap_sync: null map state");
- if (ims->ims_sb == NULL)
- panic("iommu_dvmamap_sync: null sb");
- if (ims->ims_sb->sb_iommu == NULL)
- panic("iommu_dvmamap_sync: null iommu");
-#endif
  if (len == 0)
  return;
 
+ if (map->_dm_flags & BUS_DMA_64BIT) {
+ if (ops & (BUS_DMASYNC_PREWRITE | BUS_DMASYNC_POSTREAD))
+ membar(MemIssue);
+ return;
+ }
+
+ ims = map->_dm_cookie;
+
  if (ops & BUS_DMASYNC_PREWRITE)
  membar(MemIssue);
 
@@ -1622,9 +1630,13 @@ iommu_dvmamem_alloc(bus_dma_tag_t t, bus
     "bound %llx segp %p flags %d\n", (unsigned long long)size,
     (unsigned long long)alignment, (unsigned long long)boundary,
     segs, flags));
+
+ if ((flags & BUS_DMA_64BIT) == 0)
+ flags |= BUS_DMA_DVMA;
+
  BUS_DMA_FIND_PARENT(t, _dmamem_alloc);
  return ((*t->_dmamem_alloc)(t, t0, size, alignment, boundary,
-    segs, nsegs, rsegs, flags | BUS_DMA_DVMA));
+    segs, nsegs, rsegs, flags));
 }
 
 void
@@ -1763,7 +1775,7 @@ iommu_iomap_load_map(struct iommu_state
 
  /* Flush cache if necessary. */
  slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
- if (is->is_flags & IOMMU_FLUSH_CACHE &&
+ if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
     (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
  IOMMUREG_WRITE(is, iommu_cache_flush,
     is->is_ptsb + slot * 8);
@@ -1788,7 +1800,7 @@ iommu_iomap_unload_map(struct iommu_stat
 
  /* Flush cache if necessary. */
  slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
- if (is->is_flags & IOMMU_FLUSH_CACHE &&
+ if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
     (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
  IOMMUREG_WRITE(is, iommu_cache_flush,
     is->is_ptsb + slot * 8);
Index: dev/iommureg.h
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/iommureg.h,v
retrieving revision 1.17
diff -u -p -r1.17 iommureg.h
--- dev/iommureg.h 17 Aug 2012 20:46:50 -0000 1.17
+++ dev/iommureg.h 10 May 2017 12:00:09 -0000
@@ -90,10 +90,11 @@ struct iommu_strbuf {
 #define IOMMUCR_DE 0x000000000000000002LL /* Diag enable */
 #define IOMMUCR_EN 0x000000000000000001LL /* Enable IOMMU */
 
-#define IOMMUCR_FIRE_SE 0x000000000000000400LL /* Snoop enable */
-#define IOMMUCR_FIRE_CM_EN 0x000000000000000300LL  /* Cache mode enable */
-#define IOMMUCR_FIRE_BE 0x000000000000000002LL /* Bypass enable */
-#define IOMMUCR_FIRE_TE 0x000000000000000001LL /* Translation enabled */
+#define IOMMUCR_FIRE_PD 0x000000000000001000UL /* Process disable */
+#define IOMMUCR_FIRE_SE 0x000000000000000400UL /* Snoop enable */
+#define IOMMUCR_FIRE_CM_EN 0x000000000000000300UL  /* Cache mode enable */
+#define IOMMUCR_FIRE_BE 0x000000000000000002UL /* Bypass enable */
+#define IOMMUCR_FIRE_TE 0x000000000000000001UL /* Translation enabled */
 
 /*
  * IOMMU stuff
Index: dev/iommuvar.h
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/iommuvar.h,v
retrieving revision 1.17
diff -u -p -r1.17 iommuvar.h
--- dev/iommuvar.h 4 May 2016 18:26:12 -0000 1.17
+++ dev/iommuvar.h 10 May 2017 12:00:09 -0000
@@ -100,6 +100,21 @@ struct iommu_map_state {
 };
 #define IOMMU_MAP_STREAM 1
 
+struct iommu_hw {
+ void (*ihw_enable)(struct iommu_state *);
+
+ unsigned long ihw_dvma_pa;
+
+ unsigned long ihw_bypass;
+ unsigned long ihw_bypass_nc; /* non-cached */
+ unsigned long ihw_bypass_ro; /* relaxed ordering */
+
+ unsigned int ihw_flags;
+#define IOMMU_HW_FLUSH_CACHE (1 << 0)
+};
+
+extern const struct iommu_hw iommu_hw_default;
+
 /*
  * per-IOMMU state
  */
@@ -112,8 +127,7 @@ struct iommu_state {
  int64_t is_cr; /* Control register value */
  struct mutex is_mtx;
  struct extent *is_dvmamap; /* DVMA map for this instance */
- int is_flags;
-#define IOMMU_FLUSH_CACHE 0x00000001
+ const struct iommu_hw *is_hw;
 
  struct strbuf_ctl *is_sb[2]; /* Streaming buffers if any */
 
@@ -126,7 +140,8 @@ struct iommu_state {
 };
 
 /* interfaces for PCI/SBus code */
-void iommu_init(char *, struct iommu_state *, int, u_int32_t);
+void iommu_init(char *, const struct iommu_hw *, struct iommu_state *,
+    int, u_int32_t);
 void iommu_reset(struct iommu_state *);
 paddr_t iommu_extract(struct iommu_state *, bus_addr_t);
 int64_t iommu_lookup_tte(struct iommu_state *, bus_addr_t);
@@ -146,6 +161,7 @@ int iommu_dvmamem_alloc(bus_dma_tag_t, b
     bus_size_t, bus_size_t, bus_dma_segment_t *, int, int *, int);
 void iommu_dvmamem_free(bus_dma_tag_t, bus_dma_tag_t, bus_dma_segment_t *,
     int);
+
 
 #define IOMMUREG_READ(is, reg) \
  bus_space_read_8((is)->is_bustag, \
Index: dev/pci_machdep.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/pci_machdep.c,v
retrieving revision 1.44
diff -u -p -r1.44 pci_machdep.c
--- dev/pci_machdep.c 10 May 2014 12:15:19 -0000 1.44
+++ dev/pci_machdep.c 10 May 2017 12:00:09 -0000
@@ -58,6 +58,7 @@ int sparc_pci_debug = 0x0;
 #include <machine/openfirm.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
+#include <dev/pci/pcidevs.h>
 
 #include <dev/ofw/ofw_pci.h>
 
@@ -85,6 +86,46 @@ pci_attach_hook(parent, self, pba)
  struct pcibus_attach_args *pba;
 {
  /* Don't do anything */
+}
+
+int
+pci_bcm_dmamap_create(bus_dma_tag_t dt, bus_dma_tag_t t0, bus_size_t size,
+    int nsegments, bus_size_t maxsegsz, bus_size_t boundary, int flags,
+    bus_dmamap_t *dmamp)
+{
+ bus_dma_tag_t pdt = dt->_parent;
+
+ CLR(flags, BUS_DMA_64BIT);
+
+ return ((*pdt->_dmamap_create)(pdt, t0, size, nsegments, maxsegsz,
+    boundary, flags, dmamp));
+}
+
+int
+pci_probe_device_hook(pci_chipset_tag_t pc, struct pci_attach_args *pa)
+{
+ bus_dma_tag_t dt, pdt;
+
+ if (pa->pa_id ==
+    PCI_ID_CODE(PCI_VENDOR_RCC, PCI_PRODUCT_RCC_PCIE_PCIX)) {
+ /*
+ * These PCI bridges only support 40bit DVA, so intercept
+ * bus_dmamap_create so we can clear BUS_DMA_64BIT.
+ */
+
+ dt = malloc(sizeof(*dt), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (dt == NULL)
+ panic("%s: could not alloc dma tag", __func__);
+
+ pdt = pa->pa_dmat;
+
+ dt->_parent = pdt;
+ dt->_dmamap_create = pci_bcm_dmamap_create;
+
+ pa->pa_dmat = dt;
+ }
+
+ return (0);
 }
 
 int
Index: dev/psycho.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/psycho.c,v
retrieving revision 1.74
diff -u -p -r1.74 psycho.c
--- dev/psycho.c 23 Aug 2016 03:28:01 -0000 1.74
+++ dev/psycho.c 10 May 2017 12:00:09 -0000
@@ -902,7 +902,7 @@ psycho_iommu_init(struct psycho_softc *s
  panic("couldn't malloc iommu name");
  snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
 
- iommu_init(name, is, tsbsize, iobase);
+ iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
 }
 
 /*
Index: dev/pyro.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/pyro.c,v
retrieving revision 1.30
diff -u -p -r1.30 pyro.c
--- dev/pyro.c 20 Dec 2016 13:40:50 -0000 1.30
+++ dev/pyro.c 10 May 2017 12:00:09 -0000
@@ -131,6 +131,30 @@ int pyro_msi_eq_intr(void *);
 int pyro_dmamap_create(bus_dma_tag_t, bus_dma_tag_t, bus_size_t, int,
     bus_size_t, bus_size_t, int, bus_dmamap_t *);
 
+void pyro_iommu_enable(struct iommu_state *);
+
+const struct iommu_hw iommu_hw_fire = {
+ .ihw_enable = pyro_iommu_enable,
+
+ .ihw_dvma_pa = 0x000007ffffffffffUL,
+
+ .ihw_bypass = 0xfffc000000000000UL,
+ .ihw_bypass_nc = 0x0000080000000000UL,
+ .ihw_bypass_ro = 0,
+};
+
+const struct iommu_hw iommu_hw_oberon = {
+ .ihw_enable = pyro_iommu_enable,
+
+ .ihw_dvma_pa = 0x00007fffffffffffUL,
+
+ .ihw_bypass = 0x7ffc000000000000UL,
+ .ihw_bypass_nc = 0x0000800000000000UL,
+ .ihw_bypass_ro = 0x8000000000000000UL,
+
+ .ihw_flags = IOMMU_HW_FLUSH_CACHE,
+};
+
 #ifdef DDB
 void pyro_xir(void *, int);
 #endif
@@ -266,6 +290,7 @@ pyro_init_iommu(struct pyro_softc *sc, s
  int tsbsize = 7;
  u_int32_t iobase = -1;
  char *name;
+ const struct iommu_hw *ihw = &iommu_hw_fire;
 
  is->is_bustag = sc->sc_bust;
 
@@ -282,11 +307,23 @@ pyro_init_iommu(struct pyro_softc *sc, s
  panic("couldn't malloc iommu name");
  snprintf(name, 32, "%s dvma", sc->sc_dv.dv_xname);
 
- /* On Oberon, we need to flush the cache. */
  if (sc->sc_oberon)
- is->is_flags |= IOMMU_FLUSH_CACHE;
+ ihw = &iommu_hw_oberon;
+
+ iommu_init(name, ihw, is, tsbsize, iobase);
+}
+
+void
+pyro_iommu_enable(struct iommu_state *is)
+{
+ unsigned long cr;
+
+ cr = IOMMUREG_READ(is, iommu_cr);
+ cr |= IOMMUCR_FIRE_BE | IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
+    IOMMUCR_FIRE_TE;
 
- iommu_init(name, is, tsbsize, iobase);
+ IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb | is->is_tsbsize);
+ IOMMUREG_WRITE(is, iommu_cr, cr);
 }
 
 void
Index: dev/sbus.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/sbus.c,v
retrieving revision 1.44
diff -u -p -r1.44 sbus.c
--- dev/sbus.c 19 Sep 2015 21:07:04 -0000 1.44
+++ dev/sbus.c 10 May 2017 12:00:09 -0000
@@ -349,7 +349,7 @@ sbus_mb_attach(struct device *parent, st
  snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
 
  printf("%s: ", sc->sc_dev.dv_xname);
- iommu_init(name, &sc->sc_is, 0, -1);
+ iommu_init(name, &iommu_hw_default, &sc->sc_is, 0, -1);
 
  /* Initialize Starfire PC interrupt translation. */
  if (OF_getprop(findroot(), "name", buf, sizeof(buf)) > 0 &&
Index: dev/schizo.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/dev/schizo.c,v
retrieving revision 1.67
diff -u -p -r1.67 schizo.c
--- dev/schizo.c 23 Aug 2016 03:28:01 -0000 1.67
+++ dev/schizo.c 10 May 2017 12:00:09 -0000
@@ -451,7 +451,7 @@ schizo_init_iommu(struct schizo_softc *s
     "using iobase=0x%x, tsbsize=%d\n", iobase, tsbsize));
  }
 
- iommu_init(name, is, tsbsize, iobase);
+ iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
 }
 
 int
Index: include/pci_machdep.h
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/include/pci_machdep.h,v
retrieving revision 1.33
diff -u -p -r1.33 pci_machdep.h
--- include/pci_machdep.h 4 May 2016 14:30:01 -0000 1.33
+++ include/pci_machdep.h 10 May 2017 12:00:09 -0000
@@ -74,10 +74,13 @@ struct sparc_pci_chipset {
  pcireg_t (*conf_read)(pci_chipset_tag_t, pcitag_t, int);
  void (*conf_write)(pci_chipset_tag_t, pcitag_t, int, pcireg_t);
  int (*intr_map)(struct pci_attach_args *, pci_intr_handle_t *);
+ int (*probe_device_hook)(void *, struct pci_attach_args *);
 };
 
 void pci_attach_hook(struct device *, struct device *,
      struct pcibus_attach_args *);
+int pci_probe_device_hook(pci_chipset_tag_t,
+    struct pci_attach_args *);
 int pci_bus_maxdevs(pci_chipset_tag_t, int);
 pcitag_t pci_make_tag(pci_chipset_tag_t, int, int, int);
 void pci_decompose_tag(pci_chipset_tag_t, pcitag_t, int *, int *,
@@ -102,8 +105,6 @@ int sparc64_pci_enumerate_bus(struct pc
     struct pci_attach_args *);
 
 #define PCI_MACHDEP_ENUMERATE_BUS sparc64_pci_enumerate_bus
-
-#define pci_probe_device_hook(c, a) (0)
 
 #define pci_min_powerstate(c, t) (PCI_PMCSR_STATE_D3)
 #define pci_set_powerstate_md(c, t, s, p)

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

David Gwynne-5
On Wed, May 10, 2017 at 10:09:59PM +1000, David Gwynne wrote:

> On Mon, May 08, 2017 at 11:03:58AM +1000, David Gwynne wrote:
> > on modern sparc64s (think fire or sparc enterprise Mx000 boxes),
> > setting up and tearing down the translation table entries (TTEs)
> > is very expensive. so expensive that the cost of doing it for disk
> > io has a noticeable impact on compile times.
> >
> > now that there's a BUS_DMA_64BIT flag, we can use that to decide
> > to bypass the iommu for devices that set that flag, therefore
> > avoiding the cost of handling the TTEs.
> >
> > the following diff adds support for bypass mappings to the iommu
> > code on sparc64. it's based on a diff from kettenis@ back in 2009.
> > the main changes are around coping with the differences between
> > schizo/psycho and fire/oberon.
> >
> > the differences between the chips are now represented by a iommu_hw
> > struct. these differences include how to enable the iommu (now via
> > a function pointer), and masks for bypass addresses.
> >
> > ive tested this on oberon (on an m4000) and schizo (on a v880).
> > however, the bypass code isnt working on fire (v245s). to cope with
> > that for now, the iommu_hw struct lets drivers mask flag bits that
> > are handled when creating a dmamap. this means fire boards will
> > ignore BUS_DMA_64BIT until i can figure out whats wrong with them.
>
> i figured it out. it turns out Fire was working fine. however,
> enabling 64bit dva on the onboard devices didnt work because the
> serverworks/broadcom pcie to pcix bridge can only handle dma addresses
> in the low 40 bits. because the fire bypass window is higher than
> this, the bridge would choke and things stopped working.
>
> the updated diff attempts to handle this. basically when probing
> the bridge, the platform creates a custom dma tag for it. this tag
> intercepts bus_dmamap_create and clears the BUS_DMA_64BIT flag before
> handing it up to the parent bridge, which is pyro in my situation.
> it looks like early sun4v boxes could make use of this too.
>
> > i have not tested this on psycho yet. if anyone has such a machine
> > and is willing to work with me to figure it out, please talk to me.
>
> i still dont have psycho reports.

Would anyone object if I committed this? I've been running it for the
last release or two without issues, but with significant improvements in
performance on the machines involved.

> Index: dev/iommu.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/sparc64/dev/iommu.c,v
> retrieving revision 1.74
> diff -u -p -r1.74 iommu.c
> --- dev/iommu.c 30 Apr 2017 16:45:45 -0000 1.74
> +++ dev/iommu.c 10 May 2017 12:00:09 -0000
> @@ -100,6 +100,25 @@ void iommu_iomap_clear_pages(struct iomm
>  void _iommu_dvmamap_sync(bus_dma_tag_t, bus_dma_tag_t, bus_dmamap_t,
>      bus_addr_t, bus_size_t, int);
>  
> +void iommu_hw_enable(struct iommu_state *);
> +
> +const struct iommu_hw iommu_hw_default = {
> + .ihw_enable = iommu_hw_enable,
> +
> + .ihw_dvma_pa = IOTTE_PAMASK,
> +
> + .ihw_bypass = 0x3fffUL << 50,
> + .ihw_bypass_nc = 0,
> + .ihw_bypass_ro = 0,
> +};
> +
> +void
> +iommu_hw_enable(struct iommu_state *is)
> +{
> + IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
> + IOMMUREG_WRITE(is, iommu_cr, IOMMUCR_EN | (is->is_tsbsize << 16));
> +}
> +
>  /*
>   * Initiate an STC entry flush.
>   */
> @@ -125,7 +144,8 @@ iommu_strbuf_flush(struct strbuf_ctl *sb
>   * - create a private DVMA map.
>   */
>  void
> -iommu_init(char *name, struct iommu_state *is, int tsbsize, u_int32_t iovabase)
> +iommu_init(char *name, const struct iommu_hw *ihw, struct iommu_state *is,
> +    int tsbsize, u_int32_t iovabase)
>  {
>   psize_t size;
>   vaddr_t va;
> @@ -149,13 +169,9 @@ iommu_init(char *name, struct iommu_stat
>   * be hard-wired, so we read the start and size from the PROM and
>   * just use those values.
>   */
> - if (strncmp(name, "pyro", 4) == 0) {
> - is->is_cr = IOMMUREG_READ(is, iommu_cr);
> - is->is_cr &= ~IOMMUCR_FIRE_BE;
> - is->is_cr |= (IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
> -    IOMMUCR_FIRE_TE);
> - } else
> - is->is_cr = IOMMUCR_EN;
> +
> + is->is_hw = ihw;
> +
>   is->is_tsbsize = tsbsize;
>   if (iovabase == (u_int32_t)-1) {
>   is->is_dvmabase = IOTSB_VSTART(is->is_tsbsize);
> @@ -237,15 +253,6 @@ iommu_init(char *name, struct iommu_stat
>   mtx_init(&is->is_mtx, IPL_HIGH);
>  
>   /*
> - * Set the TSB size.  The relevant bits were moved to the TSB
> - * base register in the PCIe host bridges.
> - */
> - if (strncmp(name, "pyro", 4) == 0)
> - is->is_ptsb |= is->is_tsbsize;
> - else
> - is->is_cr |= (is->is_tsbsize << 16);
> -
> - /*
>   * Now actually start up the IOMMU.
>   */
>   iommu_reset(is);
> @@ -262,10 +269,7 @@ iommu_reset(struct iommu_state *is)
>  {
>   int i;
>  
> - IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
> -
> - /* Enable IOMMU */
> - IOMMUREG_WRITE(is, iommu_cr, is->is_cr);
> + (*is->is_hw->ihw_enable)(is);
>  
>   for (i = 0; i < 2; ++i) {
>   struct strbuf_ctl *sb = is->is_sb[i];
> @@ -280,7 +284,7 @@ iommu_reset(struct iommu_state *is)
>   printf(", STC%d enabled", i);
>   }
>  
> - if (is->is_flags & IOMMU_FLUSH_CACHE)
> + if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE))
>   IOMMUREG_WRITE(is, iommu_cache_invalidate, -1ULL);
>  }
>  
> @@ -433,7 +437,7 @@ iommu_extract(struct iommu_state *is, bu
>   if (dva >= is->is_dvmabase && dva <= is->is_dvmaend)
>   tte = is->is_tsb[IOTSBSLOT(dva, is->is_tsbsize)];
>  
> - return (tte & IOTTE_PAMASK);
> + return (tte & is->is_hw->ihw_dvma_pa);
>  }
>  
>  /*
> @@ -601,6 +605,7 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
>  {
>   int ret;
>   bus_dmamap_t map;
> + struct iommu_state *is = sb->sb_iommu;
>   struct iommu_map_state *ims;
>  
>   BUS_DMA_FIND_PARENT(t, _dmamap_create);
> @@ -610,6 +615,12 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
>   if (ret)
>   return (ret);
>  
> + if (flags & BUS_DMA_64BIT) {
> + map->_dm_cookie = is;
> + *dmamap = map;
> + return (0);
> + }
> +
>   ims = iommu_iomap_create(atop(round_page(size)));
>  
>   if (ims == NULL) {
> @@ -641,8 +652,10 @@ iommu_dvmamap_destroy(bus_dma_tag_t t, b
>   if (map->dm_nsegs)
>   bus_dmamap_unload(t0, map);
>  
> -        if (map->_dm_cookie)
> -                iommu_iomap_destroy(map->_dm_cookie);
> + if (!ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> +        if (map->_dm_cookie)
> + iommu_iomap_destroy(map->_dm_cookie);
> + }
>   map->_dm_cookie = NULL;
>  
>   BUS_DMA_FIND_PARENT(t, _dmamap_destroy);
> @@ -667,36 +680,36 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_
>   u_long dvmaddr, sgstart, sgend;
>   bus_size_t align, boundary;
>   struct iommu_state *is;
> - struct iommu_map_state *ims = map->_dm_cookie;
> + struct iommu_map_state *ims;
>   pmap_t pmap;
>  
> -#ifdef DIAGNOSTIC
> - if (ims == NULL)
> - panic("iommu_dvmamap_load: null map state");
> -#endif
> -#ifdef DEBUG
> - if (ims->ims_sb == NULL)
> - panic("iommu_dvmamap_load: null sb");
> - if (ims->ims_sb->sb_iommu == NULL)
> - panic("iommu_dvmamap_load: null iommu");
> -#endif /* DEBUG */
> - is = ims->ims_sb->sb_iommu;
> -
> - if (map->dm_nsegs) {
> - /*
> - * Is it still in use? _bus_dmamap_load should have taken care
> - * of this.
> - */
> -#ifdef DIAGNOSTIC
> - panic("iommu_dvmamap_load: map still in use");
> -#endif
> - bus_dmamap_unload(t0, map);
> - }
> -
>   /*
>   * Make sure that on error condition we return "no valid mappings".
>   */
> - map->dm_nsegs = 0;
> + KASSERTMSG(map->dm_nsegs == 0, "map still in use");
> +
> + if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> + unsigned long bypass;
> + int i;
> +
> + is = map->_dm_cookie;
> + bypass = is->is_hw->ihw_bypass;
> +
> + /* Bypass translation by the IOMMU. */
> +
> + BUS_DMA_FIND_PARENT(t, _dmamap_load);
> + err = (*t->_dmamap_load)(t, t0, map, buf, buflen, p, flags);
> + if (err != 0)
> + return (err);
> +
> + for (i = 0; i < map->dm_nsegs; i++)
> + map->dm_segs[i].ds_addr |= bypass;
> +
> + return (0);
> + }
> +
> + ims = map->_dm_cookie;
> + is = ims->ims_sb->sb_iommu;
>  
>   if (buflen < 1 || buflen > map->_dm_size) {
>   DPRINTF(IDB_BUSDMA,
> @@ -876,28 +889,31 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t,
>   bus_size_t boundary, align;
>   u_long dvmaddr, sgstart, sgend;
>   struct iommu_state *is;
> - struct iommu_map_state *ims = map->_dm_cookie;
> + struct iommu_map_state *ims;
>  
> -#ifdef DIAGNOSTIC
> - if (ims == NULL)
> - panic("iommu_dvmamap_load_raw: null map state");
> -#endif
> -#ifdef DEBUG
> - if (ims->ims_sb == NULL)
> - panic("iommu_dvmamap_load_raw: null sb");
> - if (ims->ims_sb->sb_iommu == NULL)
> - panic("iommu_dvmamap_load_raw: null iommu");
> -#endif /* DEBUG */
> - is = ims->ims_sb->sb_iommu;
> + KASSERTMSG(map->dm_nsegs == 0, "map stil in use");
>  
> - if (map->dm_nsegs) {
> - /* Already in use?? */
> -#ifdef DIAGNOSTIC
> - panic("iommu_dvmamap_load_raw: map still in use");
> -#endif
> - bus_dmamap_unload(t0, map);
> + if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> + unsigned long bypass;
> +
> + is = map->_dm_cookie;
> + bypass = is->is_hw->ihw_bypass;
> +
> + /* Bypass translation by the IOMMU. */
> + for (i = 0; i < nsegs; i++) {
> + map->dm_segs[i].ds_addr = bypass | segs[i].ds_addr;
> + map->dm_segs[i].ds_len = segs[i].ds_len;
> + }
> +
> + map->dm_nsegs = nsegs;
> + map->dm_mapsize = size;
> +
> + return (0);
>   }
>  
> + ims = map->_dm_cookie;
> + is = ims->ims_sb->sb_iommu;
> +
>   /*
>   * A boundary presented to bus_dmamem_alloc() takes precedence
>   * over boundary in the map.
> @@ -1088,11 +1104,6 @@ iommu_dvmamap_append_range(bus_dma_tag_t
>   bus_dma_segment_t *seg = NULL;
>   int i = map->dm_nsegs;
>  
> -#ifdef DEBUG
> - if (ims == NULL)
> - panic("iommu_dvmamap_append_range: null map state");
> -#endif
> -
>   sgstart = iommu_iomap_translate(ims, pa);
>   sgend = sgstart + length - 1;
>  
> @@ -1298,20 +1309,17 @@ void
>  iommu_dvmamap_unload(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t map)
>  {
>   struct iommu_state *is;
> - struct iommu_map_state *ims = map->_dm_cookie;
> + struct iommu_map_state *ims;
>   bus_addr_t dvmaddr = map->_dm_dvmastart;
>   bus_size_t sgsize = map->_dm_dvmasize;
>   int error;
>  
> -#ifdef DEBUG
> - if (ims == NULL)
> - panic("iommu_dvmamap_unload: null map state");
> - if (ims->ims_sb == NULL)
> - panic("iommu_dvmamap_unload: null sb");
> - if (ims->ims_sb->sb_iommu == NULL)
> - panic("iommu_dvmamap_unload: null iommu");
> -#endif /* DEBUG */
> + if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> + bus_dmamap_unload(t->_parent, map);
> + return;
> + }
>  
> + ims = map->_dm_cookie;
>   is = ims->ims_sb->sb_iommu;
>  
>   /* Flush the iommu */
> @@ -1488,7 +1496,7 @@ iommu_dvmamap_print_map(bus_dma_tag_t t,
>   break;
>   }
>  
> - if (map->_dm_cookie) {
> + if (!ISSET(map->_dm_flags, BUS_DMA_64BIT) && map->_dm_cookie != NULL) {
>   struct iommu_map_state *ims = map->_dm_cookie;
>   struct iommu_page_map *ipm = &ims->ims_map;
>  
> @@ -1546,19 +1554,19 @@ void
>  iommu_dvmamap_sync(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t map,
>      bus_addr_t offset, bus_size_t len, int ops)
>  {
> - struct iommu_map_state *ims = map->_dm_cookie;
> + struct iommu_map_state *ims;
>  
> -#ifdef DIAGNOSTIC
> - if (ims == NULL)
> - panic("iommu_dvmamap_sync: null map state");
> - if (ims->ims_sb == NULL)
> - panic("iommu_dvmamap_sync: null sb");
> - if (ims->ims_sb->sb_iommu == NULL)
> - panic("iommu_dvmamap_sync: null iommu");
> -#endif
>   if (len == 0)
>   return;
>  
> + if (map->_dm_flags & BUS_DMA_64BIT) {
> + if (ops & (BUS_DMASYNC_PREWRITE | BUS_DMASYNC_POSTREAD))
> + membar(MemIssue);
> + return;
> + }
> +
> + ims = map->_dm_cookie;
> +
>   if (ops & BUS_DMASYNC_PREWRITE)
>   membar(MemIssue);
>  
> @@ -1622,9 +1630,13 @@ iommu_dvmamem_alloc(bus_dma_tag_t t, bus
>      "bound %llx segp %p flags %d\n", (unsigned long long)size,
>      (unsigned long long)alignment, (unsigned long long)boundary,
>      segs, flags));
> +
> + if ((flags & BUS_DMA_64BIT) == 0)
> + flags |= BUS_DMA_DVMA;
> +
>   BUS_DMA_FIND_PARENT(t, _dmamem_alloc);
>   return ((*t->_dmamem_alloc)(t, t0, size, alignment, boundary,
> -    segs, nsegs, rsegs, flags | BUS_DMA_DVMA));
> +    segs, nsegs, rsegs, flags));
>  }
>  
>  void
> @@ -1763,7 +1775,7 @@ iommu_iomap_load_map(struct iommu_state
>  
>   /* Flush cache if necessary. */
>   slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
> - if (is->is_flags & IOMMU_FLUSH_CACHE &&
> + if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
>      (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
>   IOMMUREG_WRITE(is, iommu_cache_flush,
>      is->is_ptsb + slot * 8);
> @@ -1788,7 +1800,7 @@ iommu_iomap_unload_map(struct iommu_stat
>  
>   /* Flush cache if necessary. */
>   slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
> - if (is->is_flags & IOMMU_FLUSH_CACHE &&
> + if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
>      (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
>   IOMMUREG_WRITE(is, iommu_cache_flush,
>      is->is_ptsb + slot * 8);
> Index: dev/iommureg.h
> ===================================================================
> RCS file: /cvs/src/sys/arch/sparc64/dev/iommureg.h,v
> retrieving revision 1.17
> diff -u -p -r1.17 iommureg.h
> --- dev/iommureg.h 17 Aug 2012 20:46:50 -0000 1.17
> +++ dev/iommureg.h 10 May 2017 12:00:09 -0000
> @@ -90,10 +90,11 @@ struct iommu_strbuf {
>  #define IOMMUCR_DE 0x000000000000000002LL /* Diag enable */
>  #define IOMMUCR_EN 0x000000000000000001LL /* Enable IOMMU */
>  
> -#define IOMMUCR_FIRE_SE 0x000000000000000400LL /* Snoop enable */
> -#define IOMMUCR_FIRE_CM_EN 0x000000000000000300LL  /* Cache mode enable */
> -#define IOMMUCR_FIRE_BE 0x000000000000000002LL /* Bypass enable */
> -#define IOMMUCR_FIRE_TE 0x000000000000000001LL /* Translation enabled */
> +#define IOMMUCR_FIRE_PD 0x000000000000001000UL /* Process disable */
> +#define IOMMUCR_FIRE_SE 0x000000000000000400UL /* Snoop enable */
> +#define IOMMUCR_FIRE_CM_EN 0x000000000000000300UL  /* Cache mode enable */
> +#define IOMMUCR_FIRE_BE 0x000000000000000002UL /* Bypass enable */
> +#define IOMMUCR_FIRE_TE 0x000000000000000001UL /* Translation enabled */
>  
>  /*
>   * IOMMU stuff
> Index: dev/iommuvar.h
> ===================================================================
> RCS file: /cvs/src/sys/arch/sparc64/dev/iommuvar.h,v
> retrieving revision 1.17
> diff -u -p -r1.17 iommuvar.h
> --- dev/iommuvar.h 4 May 2016 18:26:12 -0000 1.17
> +++ dev/iommuvar.h 10 May 2017 12:00:09 -0000
> @@ -100,6 +100,21 @@ struct iommu_map_state {
>  };
>  #define IOMMU_MAP_STREAM 1
>  
> +struct iommu_hw {
> + void (*ihw_enable)(struct iommu_state *);
> +
> + unsigned long ihw_dvma_pa;
> +
> + unsigned long ihw_bypass;
> + unsigned long ihw_bypass_nc; /* non-cached */
> + unsigned long ihw_bypass_ro; /* relaxed ordering */
> +
> + unsigned int ihw_flags;
> +#define IOMMU_HW_FLUSH_CACHE (1 << 0)
> +};
> +
> +extern const struct iommu_hw iommu_hw_default;
> +
>  /*
>   * per-IOMMU state
>   */
> @@ -112,8 +127,7 @@ struct iommu_state {
>   int64_t is_cr; /* Control register value */
>   struct mutex is_mtx;
>   struct extent *is_dvmamap; /* DVMA map for this instance */
> - int is_flags;
> -#define IOMMU_FLUSH_CACHE 0x00000001
> + const struct iommu_hw *is_hw;
>  
>   struct strbuf_ctl *is_sb[2]; /* Streaming buffers if any */
>  
> @@ -126,7 +140,8 @@ struct iommu_state {
>  };
>  
>  /* interfaces for PCI/SBus code */
> -void iommu_init(char *, struct iommu_state *, int, u_int32_t);
> +void iommu_init(char *, const struct iommu_hw *, struct iommu_state *,
> +    int, u_int32_t);
>  void iommu_reset(struct iommu_state *);
>  paddr_t iommu_extract(struct iommu_state *, bus_addr_t);
>  int64_t iommu_lookup_tte(struct iommu_state *, bus_addr_t);
> @@ -146,6 +161,7 @@ int iommu_dvmamem_alloc(bus_dma_tag_t, b
>      bus_size_t, bus_size_t, bus_dma_segment_t *, int, int *, int);
>  void iommu_dvmamem_free(bus_dma_tag_t, bus_dma_tag_t, bus_dma_segment_t *,
>      int);
> +
>  
>  #define IOMMUREG_READ(is, reg) \
>   bus_space_read_8((is)->is_bustag, \
> Index: dev/pci_machdep.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/sparc64/dev/pci_machdep.c,v
> retrieving revision 1.44
> diff -u -p -r1.44 pci_machdep.c
> --- dev/pci_machdep.c 10 May 2014 12:15:19 -0000 1.44
> +++ dev/pci_machdep.c 10 May 2017 12:00:09 -0000
> @@ -58,6 +58,7 @@ int sparc_pci_debug = 0x0;
>  #include <machine/openfirm.h>
>  #include <dev/pci/pcivar.h>
>  #include <dev/pci/pcireg.h>
> +#include <dev/pci/pcidevs.h>
>  
>  #include <dev/ofw/ofw_pci.h>
>  
> @@ -85,6 +86,46 @@ pci_attach_hook(parent, self, pba)
>   struct pcibus_attach_args *pba;
>  {
>   /* Don't do anything */
> +}
> +
> +int
> +pci_bcm_dmamap_create(bus_dma_tag_t dt, bus_dma_tag_t t0, bus_size_t size,
> +    int nsegments, bus_size_t maxsegsz, bus_size_t boundary, int flags,
> +    bus_dmamap_t *dmamp)
> +{
> + bus_dma_tag_t pdt = dt->_parent;
> +
> + CLR(flags, BUS_DMA_64BIT);
> +
> + return ((*pdt->_dmamap_create)(pdt, t0, size, nsegments, maxsegsz,
> +    boundary, flags, dmamp));
> +}
> +
> +int
> +pci_probe_device_hook(pci_chipset_tag_t pc, struct pci_attach_args *pa)
> +{
> + bus_dma_tag_t dt, pdt;
> +
> + if (pa->pa_id ==
> +    PCI_ID_CODE(PCI_VENDOR_RCC, PCI_PRODUCT_RCC_PCIE_PCIX)) {
> + /*
> + * These PCI bridges only support 40bit DVA, so intercept
> + * bus_dmamap_create so we can clear BUS_DMA_64BIT.
> + */
> +
> + dt = malloc(sizeof(*dt), M_DEVBUF, M_NOWAIT | M_ZERO);
> + if (dt == NULL)
> + panic("%s: could not alloc dma tag", __func__);
> +
> + pdt = pa->pa_dmat;
> +
> + dt->_parent = pdt;
> + dt->_dmamap_create = pci_bcm_dmamap_create;
> +
> + pa->pa_dmat = dt;
> + }
> +
> + return (0);
>  }
>  
>  int
> Index: dev/psycho.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/sparc64/dev/psycho.c,v
> retrieving revision 1.74
> diff -u -p -r1.74 psycho.c
> --- dev/psycho.c 23 Aug 2016 03:28:01 -0000 1.74
> +++ dev/psycho.c 10 May 2017 12:00:09 -0000
> @@ -902,7 +902,7 @@ psycho_iommu_init(struct psycho_softc *s
>   panic("couldn't malloc iommu name");
>   snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
>  
> - iommu_init(name, is, tsbsize, iobase);
> + iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
>  }
>  
>  /*
> Index: dev/pyro.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/sparc64/dev/pyro.c,v
> retrieving revision 1.30
> diff -u -p -r1.30 pyro.c
> --- dev/pyro.c 20 Dec 2016 13:40:50 -0000 1.30
> +++ dev/pyro.c 10 May 2017 12:00:09 -0000
> @@ -131,6 +131,30 @@ int pyro_msi_eq_intr(void *);
>  int pyro_dmamap_create(bus_dma_tag_t, bus_dma_tag_t, bus_size_t, int,
>      bus_size_t, bus_size_t, int, bus_dmamap_t *);
>  
> +void pyro_iommu_enable(struct iommu_state *);
> +
> +const struct iommu_hw iommu_hw_fire = {
> + .ihw_enable = pyro_iommu_enable,
> +
> + .ihw_dvma_pa = 0x000007ffffffffffUL,
> +
> + .ihw_bypass = 0xfffc000000000000UL,
> + .ihw_bypass_nc = 0x0000080000000000UL,
> + .ihw_bypass_ro = 0,
> +};
> +
> +const struct iommu_hw iommu_hw_oberon = {
> + .ihw_enable = pyro_iommu_enable,
> +
> + .ihw_dvma_pa = 0x00007fffffffffffUL,
> +
> + .ihw_bypass = 0x7ffc000000000000UL,
> + .ihw_bypass_nc = 0x0000800000000000UL,
> + .ihw_bypass_ro = 0x8000000000000000UL,
> +
> + .ihw_flags = IOMMU_HW_FLUSH_CACHE,
> +};
> +
>  #ifdef DDB
>  void pyro_xir(void *, int);
>  #endif
> @@ -266,6 +290,7 @@ pyro_init_iommu(struct pyro_softc *sc, s
>   int tsbsize = 7;
>   u_int32_t iobase = -1;
>   char *name;
> + const struct iommu_hw *ihw = &iommu_hw_fire;
>  
>   is->is_bustag = sc->sc_bust;
>  
> @@ -282,11 +307,23 @@ pyro_init_iommu(struct pyro_softc *sc, s
>   panic("couldn't malloc iommu name");
>   snprintf(name, 32, "%s dvma", sc->sc_dv.dv_xname);
>  
> - /* On Oberon, we need to flush the cache. */
>   if (sc->sc_oberon)
> - is->is_flags |= IOMMU_FLUSH_CACHE;
> + ihw = &iommu_hw_oberon;
> +
> + iommu_init(name, ihw, is, tsbsize, iobase);
> +}
> +
> +void
> +pyro_iommu_enable(struct iommu_state *is)
> +{
> + unsigned long cr;
> +
> + cr = IOMMUREG_READ(is, iommu_cr);
> + cr |= IOMMUCR_FIRE_BE | IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
> +    IOMMUCR_FIRE_TE;
>  
> - iommu_init(name, is, tsbsize, iobase);
> + IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb | is->is_tsbsize);
> + IOMMUREG_WRITE(is, iommu_cr, cr);
>  }
>  
>  void
> Index: dev/sbus.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/sparc64/dev/sbus.c,v
> retrieving revision 1.44
> diff -u -p -r1.44 sbus.c
> --- dev/sbus.c 19 Sep 2015 21:07:04 -0000 1.44
> +++ dev/sbus.c 10 May 2017 12:00:09 -0000
> @@ -349,7 +349,7 @@ sbus_mb_attach(struct device *parent, st
>   snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
>  
>   printf("%s: ", sc->sc_dev.dv_xname);
> - iommu_init(name, &sc->sc_is, 0, -1);
> + iommu_init(name, &iommu_hw_default, &sc->sc_is, 0, -1);
>  
>   /* Initialize Starfire PC interrupt translation. */
>   if (OF_getprop(findroot(), "name", buf, sizeof(buf)) > 0 &&
> Index: dev/schizo.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/sparc64/dev/schizo.c,v
> retrieving revision 1.67
> diff -u -p -r1.67 schizo.c
> --- dev/schizo.c 23 Aug 2016 03:28:01 -0000 1.67
> +++ dev/schizo.c 10 May 2017 12:00:09 -0000
> @@ -451,7 +451,7 @@ schizo_init_iommu(struct schizo_softc *s
>      "using iobase=0x%x, tsbsize=%d\n", iobase, tsbsize));
>   }
>  
> - iommu_init(name, is, tsbsize, iobase);
> + iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
>  }
>  
>  int
> Index: include/pci_machdep.h
> ===================================================================
> RCS file: /cvs/src/sys/arch/sparc64/include/pci_machdep.h,v
> retrieving revision 1.33
> diff -u -p -r1.33 pci_machdep.h
> --- include/pci_machdep.h 4 May 2016 14:30:01 -0000 1.33
> +++ include/pci_machdep.h 10 May 2017 12:00:09 -0000
> @@ -74,10 +74,13 @@ struct sparc_pci_chipset {
>   pcireg_t (*conf_read)(pci_chipset_tag_t, pcitag_t, int);
>   void (*conf_write)(pci_chipset_tag_t, pcitag_t, int, pcireg_t);
>   int (*intr_map)(struct pci_attach_args *, pci_intr_handle_t *);
> + int (*probe_device_hook)(void *, struct pci_attach_args *);
>  };
>  
>  void pci_attach_hook(struct device *, struct device *,
>       struct pcibus_attach_args *);
> +int pci_probe_device_hook(pci_chipset_tag_t,
> +    struct pci_attach_args *);
>  int pci_bus_maxdevs(pci_chipset_tag_t, int);
>  pcitag_t pci_make_tag(pci_chipset_tag_t, int, int, int);
>  void pci_decompose_tag(pci_chipset_tag_t, pcitag_t, int *, int *,
> @@ -102,8 +105,6 @@ int sparc64_pci_enumerate_bus(struct pc
>      struct pci_attach_args *);
>  
>  #define PCI_MACHDEP_ENUMERATE_BUS sparc64_pci_enumerate_bus
> -
> -#define pci_probe_device_hook(c, a) (0)
>  
>  #define pci_min_powerstate(c, t) (PCI_PMCSR_STATE_D3)
>  #define pci_set_powerstate_md(c, t, s, p)

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Mark Kettenis
> Date: Fri, 19 Oct 2018 10:22:30 +1000
> From: David Gwynne <[hidden email]>
>
> On Wed, May 10, 2017 at 10:09:59PM +1000, David Gwynne wrote:
> > On Mon, May 08, 2017 at 11:03:58AM +1000, David Gwynne wrote:
> > > on modern sparc64s (think fire or sparc enterprise Mx000 boxes),
> > > setting up and tearing down the translation table entries (TTEs)
> > > is very expensive. so expensive that the cost of doing it for disk
> > > io has a noticeable impact on compile times.
> > >
> > > now that there's a BUS_DMA_64BIT flag, we can use that to decide
> > > to bypass the iommu for devices that set that flag, therefore
> > > avoiding the cost of handling the TTEs.
> > >
> > > the following diff adds support for bypass mappings to the iommu
> > > code on sparc64. it's based on a diff from kettenis@ back in 2009.
> > > the main changes are around coping with the differences between
> > > schizo/psycho and fire/oberon.
> > >
> > > the differences between the chips are now represented by a iommu_hw
> > > struct. these differences include how to enable the iommu (now via
> > > a function pointer), and masks for bypass addresses.
> > >
> > > ive tested this on oberon (on an m4000) and schizo (on a v880).
> > > however, the bypass code isnt working on fire (v245s). to cope with
> > > that for now, the iommu_hw struct lets drivers mask flag bits that
> > > are handled when creating a dmamap. this means fire boards will
> > > ignore BUS_DMA_64BIT until i can figure out whats wrong with them.
> >
> > i figured it out. it turns out Fire was working fine. however,
> > enabling 64bit dva on the onboard devices didnt work because the
> > serverworks/broadcom pcie to pcix bridge can only handle dma addresses
> > in the low 40 bits. because the fire bypass window is higher than
> > this, the bridge would choke and things stopped working.
> >
> > the updated diff attempts to handle this. basically when probing
> > the bridge, the platform creates a custom dma tag for it. this tag
> > intercets bus_dmamap_create and clears the BUS_DMA_64BIT flag before
> > handing it up to the parent bridge, which is pyro in my situation.
> > it looks like early sun4v boxes could make use of this too.
> >
> > > i have not tested this on psycho yet. if anyone has such a machine
> > > and is willing to work with me to figure it out, please talk to me.
> >
> > i still dont have psycho reports.
>
> Would anyone object if I committed this? I've been running it for the
> last release or two without issues, but with significant improvements in
> performance on the machines involved.

At the price of giving all PCI devices unrestricted access to memory.

So I'm not eager to do this, especially since on sun4v hardware bypassing
the iommu isn't possible as soon as multiple domains are enabled.  And
we lose a useful diagnostic when developing drivers.  Are you sure the
iommu overhead can't be reduced some other way?  At some point we
probably want to add iommu support on amd64 and arm64, but if that
comes with a similar overhead as on sparc64 that's going to be a bit
of an issue.

> > Index: dev/iommu.c
> > ===================================================================
> > RCS file: /cvs/src/sys/arch/sparc64/dev/iommu.c,v
> > retrieving revision 1.74
> > diff -u -p -r1.74 iommu.c
> > --- dev/iommu.c 30 Apr 2017 16:45:45 -0000 1.74
> > +++ dev/iommu.c 10 May 2017 12:00:09 -0000
> > @@ -100,6 +100,25 @@ void iommu_iomap_clear_pages(struct iomm
> >  void _iommu_dvmamap_sync(bus_dma_tag_t, bus_dma_tag_t, bus_dmamap_t,
> >      bus_addr_t, bus_size_t, int);
> >  
> > +void iommu_hw_enable(struct iommu_state *);
> > +
> > +const struct iommu_hw iommu_hw_default = {
> > + .ihw_enable = iommu_hw_enable,
> > +
> > + .ihw_dvma_pa = IOTTE_PAMASK,
> > +
> > + .ihw_bypass = 0x3fffUL << 50,
> > + .ihw_bypass_nc = 0,
> > + .ihw_bypass_ro = 0,
> > +};
> > +
> > +void
> > +iommu_hw_enable(struct iommu_state *is)
> > +{
> > + IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
> > + IOMMUREG_WRITE(is, iommu_cr, IOMMUCR_EN | (is->is_tsbsize << 16));
> > +}
> > +
> >  /*
> >   * Initiate an STC entry flush.
> >   */
> > @@ -125,7 +144,8 @@ iommu_strbuf_flush(struct strbuf_ctl *sb
> >   * - create a private DVMA map.
> >   */
> >  void
> > -iommu_init(char *name, struct iommu_state *is, int tsbsize, u_int32_t iovabase)
> > +iommu_init(char *name, const struct iommu_hw *ihw, struct iommu_state *is,
> > +    int tsbsize, u_int32_t iovabase)
> >  {
> >   psize_t size;
> >   vaddr_t va;
> > @@ -149,13 +169,9 @@ iommu_init(char *name, struct iommu_stat
> >   * be hard-wired, so we read the start and size from the PROM and
> >   * just use those values.
> >   */
> > - if (strncmp(name, "pyro", 4) == 0) {
> > - is->is_cr = IOMMUREG_READ(is, iommu_cr);
> > - is->is_cr &= ~IOMMUCR_FIRE_BE;
> > - is->is_cr |= (IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
> > -    IOMMUCR_FIRE_TE);
> > - } else
> > - is->is_cr = IOMMUCR_EN;
> > +
> > + is->is_hw = ihw;
> > +
> >   is->is_tsbsize = tsbsize;
> >   if (iovabase == (u_int32_t)-1) {
> >   is->is_dvmabase = IOTSB_VSTART(is->is_tsbsize);
> > @@ -237,15 +253,6 @@ iommu_init(char *name, struct iommu_stat
> >   mtx_init(&is->is_mtx, IPL_HIGH);
> >  
> >   /*
> > - * Set the TSB size.  The relevant bits were moved to the TSB
> > - * base register in the PCIe host bridges.
> > - */
> > - if (strncmp(name, "pyro", 4) == 0)
> > - is->is_ptsb |= is->is_tsbsize;
> > - else
> > - is->is_cr |= (is->is_tsbsize << 16);
> > -
> > - /*
> >   * Now actually start up the IOMMU.
> >   */
> >   iommu_reset(is);
> > @@ -262,10 +269,7 @@ iommu_reset(struct iommu_state *is)
> >  {
> >   int i;
> >  
> > - IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
> > -
> > - /* Enable IOMMU */
> > - IOMMUREG_WRITE(is, iommu_cr, is->is_cr);
> > + (*is->is_hw->ihw_enable)(is);
> >  
> >   for (i = 0; i < 2; ++i) {
> >   struct strbuf_ctl *sb = is->is_sb[i];
> > @@ -280,7 +284,7 @@ iommu_reset(struct iommu_state *is)
> >   printf(", STC%d enabled", i);
> >   }
> >  
> > - if (is->is_flags & IOMMU_FLUSH_CACHE)
> > + if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE))
> >   IOMMUREG_WRITE(is, iommu_cache_invalidate, -1ULL);
> >  }
> >  
> > @@ -433,7 +437,7 @@ iommu_extract(struct iommu_state *is, bu
> >   if (dva >= is->is_dvmabase && dva <= is->is_dvmaend)
> >   tte = is->is_tsb[IOTSBSLOT(dva, is->is_tsbsize)];
> >  
> > - return (tte & IOTTE_PAMASK);
> > + return (tte & is->is_hw->ihw_dvma_pa);
> >  }
> >  
> >  /*
> > @@ -601,6 +605,7 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
> >  {
> >   int ret;
> >   bus_dmamap_t map;
> > + struct iommu_state *is = sb->sb_iommu;
> >   struct iommu_map_state *ims;
> >  
> >   BUS_DMA_FIND_PARENT(t, _dmamap_create);
> > @@ -610,6 +615,12 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
> >   if (ret)
> >   return (ret);
> >  
> > + if (flags & BUS_DMA_64BIT) {
> > + map->_dm_cookie = is;
> > + *dmamap = map;
> > + return (0);
> > + }
> > +
> >   ims = iommu_iomap_create(atop(round_page(size)));
> >  
> >   if (ims == NULL) {
> > @@ -641,8 +652,10 @@ iommu_dvmamap_destroy(bus_dma_tag_t t, b
> >   if (map->dm_nsegs)
> >   bus_dmamap_unload(t0, map);
> >  
> > -        if (map->_dm_cookie)
> > -                iommu_iomap_destroy(map->_dm_cookie);
> > + if (!ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> > +        if (map->_dm_cookie)
> > + iommu_iomap_destroy(map->_dm_cookie);
> > + }
> >   map->_dm_cookie = NULL;
> >  
> >   BUS_DMA_FIND_PARENT(t, _dmamap_destroy);
> > @@ -667,36 +680,36 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_
> >   u_long dvmaddr, sgstart, sgend;
> >   bus_size_t align, boundary;
> >   struct iommu_state *is;
> > - struct iommu_map_state *ims = map->_dm_cookie;
> > + struct iommu_map_state *ims;
> >   pmap_t pmap;
> >  
> > -#ifdef DIAGNOSTIC
> > - if (ims == NULL)
> > - panic("iommu_dvmamap_load: null map state");
> > -#endif
> > -#ifdef DEBUG
> > - if (ims->ims_sb == NULL)
> > - panic("iommu_dvmamap_load: null sb");
> > - if (ims->ims_sb->sb_iommu == NULL)
> > - panic("iommu_dvmamap_load: null iommu");
> > -#endif /* DEBUG */
> > - is = ims->ims_sb->sb_iommu;
> > -
> > - if (map->dm_nsegs) {
> > - /*
> > - * Is it still in use? _bus_dmamap_load should have taken care
> > - * of this.
> > - */
> > -#ifdef DIAGNOSTIC
> > - panic("iommu_dvmamap_load: map still in use");
> > -#endif
> > - bus_dmamap_unload(t0, map);
> > - }
> > -
> >   /*
> >   * Make sure that on error condition we return "no valid mappings".
> >   */
> > - map->dm_nsegs = 0;
> > + KASSERTMSG(map->dm_nsegs == 0, "map still in use");
> > +
> > + if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> > + unsigned long bypass;
> > + int i;
> > +
> > + is = map->_dm_cookie;
> > + bypass = is->is_hw->ihw_bypass;
> > +
> > + /* Bypass translation by the IOMMU. */
> > +
> > + BUS_DMA_FIND_PARENT(t, _dmamap_load);
> > + err = (*t->_dmamap_load)(t, t0, map, buf, buflen, p, flags);
> > + if (err != 0)
> > + return (err);
> > +
> > + for (i = 0; i < map->dm_nsegs; i++)
> > + map->dm_segs[i].ds_addr |= bypass;
> > +
> > + return (0);
> > + }
> > +
> > + ims = map->_dm_cookie;
> > + is = ims->ims_sb->sb_iommu;
> >  
> >   if (buflen < 1 || buflen > map->_dm_size) {
> >   DPRINTF(IDB_BUSDMA,
> > @@ -876,28 +889,31 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t,
> >   bus_size_t boundary, align;
> >   u_long dvmaddr, sgstart, sgend;
> >   struct iommu_state *is;
> > - struct iommu_map_state *ims = map->_dm_cookie;
> > + struct iommu_map_state *ims;
> >  
> > -#ifdef DIAGNOSTIC
> > - if (ims == NULL)
> > - panic("iommu_dvmamap_load_raw: null map state");
> > -#endif
> > -#ifdef DEBUG
> > - if (ims->ims_sb == NULL)
> > - panic("iommu_dvmamap_load_raw: null sb");
> > - if (ims->ims_sb->sb_iommu == NULL)
> > - panic("iommu_dvmamap_load_raw: null iommu");
> > -#endif /* DEBUG */
> > - is = ims->ims_sb->sb_iommu;
> > + KASSERTMSG(map->dm_nsegs == 0, "map stil in use");
> >  
> > - if (map->dm_nsegs) {
> > - /* Already in use?? */
> > -#ifdef DIAGNOSTIC
> > - panic("iommu_dvmamap_load_raw: map still in use");
> > -#endif
> > - bus_dmamap_unload(t0, map);
> > + if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> > + unsigned long bypass;
> > +
> > + is = map->_dm_cookie;
> > + bypass = is->is_hw->ihw_bypass;
> > +
> > + /* Bypass translation by the IOMMU. */
> > + for (i = 0; i < nsegs; i++) {
> > + map->dm_segs[i].ds_addr = bypass | segs[i].ds_addr;
> > + map->dm_segs[i].ds_len = segs[i].ds_len;
> > + }
> > +
> > + map->dm_nsegs = nsegs;
> > + map->dm_mapsize = size;
> > +
> > + return (0);
> >   }
> >  
> > + ims = map->_dm_cookie;
> > + is = ims->ims_sb->sb_iommu;
> > +
> >   /*
> >   * A boundary presented to bus_dmamem_alloc() takes precedence
> >   * over boundary in the map.
> > @@ -1088,11 +1104,6 @@ iommu_dvmamap_append_range(bus_dma_tag_t
> >   bus_dma_segment_t *seg = NULL;
> >   int i = map->dm_nsegs;
> >  
> > -#ifdef DEBUG
> > - if (ims == NULL)
> > - panic("iommu_dvmamap_append_range: null map state");
> > -#endif
> > -
> >   sgstart = iommu_iomap_translate(ims, pa);
> >   sgend = sgstart + length - 1;
> >  
> > @@ -1298,20 +1309,17 @@ void
> >  iommu_dvmamap_unload(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t map)
> >  {
> >   struct iommu_state *is;
> > - struct iommu_map_state *ims = map->_dm_cookie;
> > + struct iommu_map_state *ims;
> >   bus_addr_t dvmaddr = map->_dm_dvmastart;
> >   bus_size_t sgsize = map->_dm_dvmasize;
> >   int error;
> >  
> > -#ifdef DEBUG
> > - if (ims == NULL)
> > - panic("iommu_dvmamap_unload: null map state");
> > - if (ims->ims_sb == NULL)
> > - panic("iommu_dvmamap_unload: null sb");
> > - if (ims->ims_sb->sb_iommu == NULL)
> > - panic("iommu_dvmamap_unload: null iommu");
> > -#endif /* DEBUG */
> > + if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> > + bus_dmamap_unload(t->_parent, map);
> > + return;
> > + }
> >  
> > + ims = map->_dm_cookie;
> >   is = ims->ims_sb->sb_iommu;
> >  
> >   /* Flush the iommu */
> > @@ -1488,7 +1496,7 @@ iommu_dvmamap_print_map(bus_dma_tag_t t,
> >   break;
> >   }
> >  
> > - if (map->_dm_cookie) {
> > + if (!ISSET(map->_dm_flags, BUS_DMA_64BIT) && map->_dm_cookie != NULL) {
> >   struct iommu_map_state *ims = map->_dm_cookie;
> >   struct iommu_page_map *ipm = &ims->ims_map;
> >  
> > @@ -1546,19 +1554,19 @@ void
> >  iommu_dvmamap_sync(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t map,
> >      bus_addr_t offset, bus_size_t len, int ops)
> >  {
> > - struct iommu_map_state *ims = map->_dm_cookie;
> > + struct iommu_map_state *ims;
> >  
> > -#ifdef DIAGNOSTIC
> > - if (ims == NULL)
> > - panic("iommu_dvmamap_sync: null map state");
> > - if (ims->ims_sb == NULL)
> > - panic("iommu_dvmamap_sync: null sb");
> > - if (ims->ims_sb->sb_iommu == NULL)
> > - panic("iommu_dvmamap_sync: null iommu");
> > -#endif
> >   if (len == 0)
> >   return;
> >  
> > + if (map->_dm_flags & BUS_DMA_64BIT) {
> > + if (ops & (BUS_DMASYNC_PREWRITE | BUS_DMASYNC_POSTREAD))
> > + membar(MemIssue);
> > + return;
> > + }
> > +
> > + ims = map->_dm_cookie;
> > +
> >   if (ops & BUS_DMASYNC_PREWRITE)
> >   membar(MemIssue);
> >  
> > @@ -1622,9 +1630,13 @@ iommu_dvmamem_alloc(bus_dma_tag_t t, bus
> >      "bound %llx segp %p flags %d\n", (unsigned long long)size,
> >      (unsigned long long)alignment, (unsigned long long)boundary,
> >      segs, flags));
> > +
> > + if ((flags & BUS_DMA_64BIT) == 0)
> > + flags |= BUS_DMA_DVMA;
> > +
> >   BUS_DMA_FIND_PARENT(t, _dmamem_alloc);
> >   return ((*t->_dmamem_alloc)(t, t0, size, alignment, boundary,
> > -    segs, nsegs, rsegs, flags | BUS_DMA_DVMA));
> > +    segs, nsegs, rsegs, flags));
> >  }
> >  
> >  void
> > @@ -1763,7 +1775,7 @@ iommu_iomap_load_map(struct iommu_state
> >  
> >   /* Flush cache if necessary. */
> >   slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
> > - if (is->is_flags & IOMMU_FLUSH_CACHE &&
> > + if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
> >      (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
> >   IOMMUREG_WRITE(is, iommu_cache_flush,
> >      is->is_ptsb + slot * 8);
> > @@ -1788,7 +1800,7 @@ iommu_iomap_unload_map(struct iommu_stat
> >  
> >   /* Flush cache if necessary. */
> >   slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
> > - if (is->is_flags & IOMMU_FLUSH_CACHE &&
> > + if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
> >      (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
> >   IOMMUREG_WRITE(is, iommu_cache_flush,
> >      is->is_ptsb + slot * 8);
> > Index: dev/iommureg.h
> > ===================================================================
> > RCS file: /cvs/src/sys/arch/sparc64/dev/iommureg.h,v
> > retrieving revision 1.17
> > diff -u -p -r1.17 iommureg.h
> > --- dev/iommureg.h 17 Aug 2012 20:46:50 -0000 1.17
> > +++ dev/iommureg.h 10 May 2017 12:00:09 -0000
> > @@ -90,10 +90,11 @@ struct iommu_strbuf {
> >  #define IOMMUCR_DE 0x000000000000000002LL /* Diag enable */
> >  #define IOMMUCR_EN 0x000000000000000001LL /* Enable IOMMU */
> >  
> > -#define IOMMUCR_FIRE_SE 0x000000000000000400LL /* Snoop enable */
> > -#define IOMMUCR_FIRE_CM_EN 0x000000000000000300LL  /* Cache mode enable */
> > -#define IOMMUCR_FIRE_BE 0x000000000000000002LL /* Bypass enable */
> > -#define IOMMUCR_FIRE_TE 0x000000000000000001LL /* Translation enabled */
> > +#define IOMMUCR_FIRE_PD 0x000000000000001000UL /* Process disable */
> > +#define IOMMUCR_FIRE_SE 0x000000000000000400UL /* Snoop enable */
> > +#define IOMMUCR_FIRE_CM_EN 0x000000000000000300UL  /* Cache mode enable */
> > +#define IOMMUCR_FIRE_BE 0x000000000000000002UL /* Bypass enable */
> > +#define IOMMUCR_FIRE_TE 0x000000000000000001UL /* Translation enabled */
> >  
> >  /*
> >   * IOMMU stuff
> > Index: dev/iommuvar.h
> > ===================================================================
> > RCS file: /cvs/src/sys/arch/sparc64/dev/iommuvar.h,v
> > retrieving revision 1.17
> > diff -u -p -r1.17 iommuvar.h
> > --- dev/iommuvar.h 4 May 2016 18:26:12 -0000 1.17
> > +++ dev/iommuvar.h 10 May 2017 12:00:09 -0000
> > @@ -100,6 +100,21 @@ struct iommu_map_state {
> >  };
> >  #define IOMMU_MAP_STREAM 1
> >  
> > +struct iommu_hw {
> > + void (*ihw_enable)(struct iommu_state *);
> > +
> > + unsigned long ihw_dvma_pa;
> > +
> > + unsigned long ihw_bypass;
> > + unsigned long ihw_bypass_nc; /* non-cached */
> > + unsigned long ihw_bypass_ro; /* relaxed ordering */
> > +
> > + unsigned int ihw_flags;
> > +#define IOMMU_HW_FLUSH_CACHE (1 << 0)
> > +};
> > +
> > +extern const struct iommu_hw iommu_hw_default;
> > +
> >  /*
> >   * per-IOMMU state
> >   */
> > @@ -112,8 +127,7 @@ struct iommu_state {
> >   int64_t is_cr; /* Control register value */
> >   struct mutex is_mtx;
> >   struct extent *is_dvmamap; /* DVMA map for this instance */
> > - int is_flags;
> > -#define IOMMU_FLUSH_CACHE 0x00000001
> > + const struct iommu_hw *is_hw;
> >  
> >   struct strbuf_ctl *is_sb[2]; /* Streaming buffers if any */
> >  
> > @@ -126,7 +140,8 @@ struct iommu_state {
> >  };
> >  
> >  /* interfaces for PCI/SBus code */
> > -void iommu_init(char *, struct iommu_state *, int, u_int32_t);
> > +void iommu_init(char *, const struct iommu_hw *, struct iommu_state *,
> > +    int, u_int32_t);
> >  void iommu_reset(struct iommu_state *);
> >  paddr_t iommu_extract(struct iommu_state *, bus_addr_t);
> >  int64_t iommu_lookup_tte(struct iommu_state *, bus_addr_t);
> > @@ -146,6 +161,7 @@ int iommu_dvmamem_alloc(bus_dma_tag_t, b
> >      bus_size_t, bus_size_t, bus_dma_segment_t *, int, int *, int);
> >  void iommu_dvmamem_free(bus_dma_tag_t, bus_dma_tag_t, bus_dma_segment_t *,
> >      int);
> > +
> >  
> >  #define IOMMUREG_READ(is, reg) \
> >   bus_space_read_8((is)->is_bustag, \
> > Index: dev/pci_machdep.c
> > ===================================================================
> > RCS file: /cvs/src/sys/arch/sparc64/dev/pci_machdep.c,v
> > retrieving revision 1.44
> > diff -u -p -r1.44 pci_machdep.c
> > --- dev/pci_machdep.c 10 May 2014 12:15:19 -0000 1.44
> > +++ dev/pci_machdep.c 10 May 2017 12:00:09 -0000
> > @@ -58,6 +58,7 @@ int sparc_pci_debug = 0x0;
> >  #include <machine/openfirm.h>
> >  #include <dev/pci/pcivar.h>
> >  #include <dev/pci/pcireg.h>
> > +#include <dev/pci/pcidevs.h>
> >  
> >  #include <dev/ofw/ofw_pci.h>
> >  
> > @@ -85,6 +86,46 @@ pci_attach_hook(parent, self, pba)
> >   struct pcibus_attach_args *pba;
> >  {
> >   /* Don't do anything */
> > +}
> > +
> > +int
> > +pci_bcm_dmamap_create(bus_dma_tag_t dt, bus_dma_tag_t t0, bus_size_t size,
> > +    int nsegments, bus_size_t maxsegsz, bus_size_t boundary, int flags,
> > +    bus_dmamap_t *dmamp)
> > +{
> > + bus_dma_tag_t pdt = dt->_parent;
> > +
> > + CLR(flags, BUS_DMA_64BIT);
> > +
> > + return ((*pdt->_dmamap_create)(pdt, t0, size, nsegments, maxsegsz,
> > +    boundary, flags, dmamp));
> > +}
> > +
> > +int
> > +pci_probe_device_hook(pci_chipset_tag_t pc, struct pci_attach_args *pa)
> > +{
> > + bus_dma_tag_t dt, pdt;
> > +
> > + if (pa->pa_id ==
> > +    PCI_ID_CODE(PCI_VENDOR_RCC, PCI_PRODUCT_RCC_PCIE_PCIX)) {
> > + /*
> > + * These PCI bridges only support 40bit DVA, so intercept
> > + * bus_dmamap_create so we can clear BUS_DMA_64BIT.
> > + */
> > +
> > + dt = malloc(sizeof(*dt), M_DEVBUF, M_NOWAIT | M_ZERO);
> > + if (dt == NULL)
> > + panic("%s: could not alloc dma tag", __func__);
> > +
> > + pdt = pa->pa_dmat;
> > +
> > + dt->_parent = pdt;
> > + dt->_dmamap_create = pci_bcm_dmamap_create;
> > +
> > + pa->pa_dmat = dt;
> > + }
> > +
> > + return (0);
> >  }
> >  
> >  int
> > Index: dev/psycho.c
> > ===================================================================
> > RCS file: /cvs/src/sys/arch/sparc64/dev/psycho.c,v
> > retrieving revision 1.74
> > diff -u -p -r1.74 psycho.c
> > --- dev/psycho.c 23 Aug 2016 03:28:01 -0000 1.74
> > +++ dev/psycho.c 10 May 2017 12:00:09 -0000
> > @@ -902,7 +902,7 @@ psycho_iommu_init(struct psycho_softc *s
> >   panic("couldn't malloc iommu name");
> >   snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
> >  
> > - iommu_init(name, is, tsbsize, iobase);
> > + iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
> >  }
> >  
> >  /*
> > Index: dev/pyro.c
> > ===================================================================
> > RCS file: /cvs/src/sys/arch/sparc64/dev/pyro.c,v
> > retrieving revision 1.30
> > diff -u -p -r1.30 pyro.c
> > --- dev/pyro.c 20 Dec 2016 13:40:50 -0000 1.30
> > +++ dev/pyro.c 10 May 2017 12:00:09 -0000
> > @@ -131,6 +131,30 @@ int pyro_msi_eq_intr(void *);
> >  int pyro_dmamap_create(bus_dma_tag_t, bus_dma_tag_t, bus_size_t, int,
> >      bus_size_t, bus_size_t, int, bus_dmamap_t *);
> >  
> > +void pyro_iommu_enable(struct iommu_state *);
> > +
> > +const struct iommu_hw iommu_hw_fire = {
> > + .ihw_enable = pyro_iommu_enable,
> > +
> > + .ihw_dvma_pa = 0x000007ffffffffffUL,
> > +
> > + .ihw_bypass = 0xfffc000000000000UL,
> > + .ihw_bypass_nc = 0x0000080000000000UL,
> > + .ihw_bypass_ro = 0,
> > +};
> > +
> > +const struct iommu_hw iommu_hw_oberon = {
> > + .ihw_enable = pyro_iommu_enable,
> > +
> > + .ihw_dvma_pa = 0x00007fffffffffffUL,
> > +
> > + .ihw_bypass = 0x7ffc000000000000UL,
> > + .ihw_bypass_nc = 0x0000800000000000UL,
> > + .ihw_bypass_ro = 0x8000000000000000UL,
> > +
> > + .ihw_flags = IOMMU_HW_FLUSH_CACHE,
> > +};
> > +
> >  #ifdef DDB
> >  void pyro_xir(void *, int);
> >  #endif
> > @@ -266,6 +290,7 @@ pyro_init_iommu(struct pyro_softc *sc, s
> >   int tsbsize = 7;
> >   u_int32_t iobase = -1;
> >   char *name;
> > + const struct iommu_hw *ihw = &iommu_hw_fire;
> >  
> >   is->is_bustag = sc->sc_bust;
> >  
> > @@ -282,11 +307,23 @@ pyro_init_iommu(struct pyro_softc *sc, s
> >   panic("couldn't malloc iommu name");
> >   snprintf(name, 32, "%s dvma", sc->sc_dv.dv_xname);
> >  
> > - /* On Oberon, we need to flush the cache. */
> >   if (sc->sc_oberon)
> > - is->is_flags |= IOMMU_FLUSH_CACHE;
> > + ihw = &iommu_hw_oberon;
> > +
> > + iommu_init(name, ihw, is, tsbsize, iobase);
> > +}
> > +
> > +void
> > +pyro_iommu_enable(struct iommu_state *is)
> > +{
> > + unsigned long cr;
> > +
> > + cr = IOMMUREG_READ(is, iommu_cr);
> > + cr |= IOMMUCR_FIRE_BE | IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
> > +    IOMMUCR_FIRE_TE;
> >  
> > - iommu_init(name, is, tsbsize, iobase);
> > + IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb | is->is_tsbsize);
> > + IOMMUREG_WRITE(is, iommu_cr, cr);
> >  }
> >  
> >  void
> > Index: dev/sbus.c
> > ===================================================================
> > RCS file: /cvs/src/sys/arch/sparc64/dev/sbus.c,v
> > retrieving revision 1.44
> > diff -u -p -r1.44 sbus.c
> > --- dev/sbus.c 19 Sep 2015 21:07:04 -0000 1.44
> > +++ dev/sbus.c 10 May 2017 12:00:09 -0000
> > @@ -349,7 +349,7 @@ sbus_mb_attach(struct device *parent, st
> >   snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
> >  
> >   printf("%s: ", sc->sc_dev.dv_xname);
> > - iommu_init(name, &sc->sc_is, 0, -1);
> > + iommu_init(name, &iommu_hw_default, &sc->sc_is, 0, -1);
> >  
> >   /* Initialize Starfire PC interrupt translation. */
> >   if (OF_getprop(findroot(), "name", buf, sizeof(buf)) > 0 &&
> > Index: dev/schizo.c
> > ===================================================================
> > RCS file: /cvs/src/sys/arch/sparc64/dev/schizo.c,v
> > retrieving revision 1.67
> > diff -u -p -r1.67 schizo.c
> > --- dev/schizo.c 23 Aug 2016 03:28:01 -0000 1.67
> > +++ dev/schizo.c 10 May 2017 12:00:09 -0000
> > @@ -451,7 +451,7 @@ schizo_init_iommu(struct schizo_softc *s
> >      "using iobase=0x%x, tsbsize=%d\n", iobase, tsbsize));
> >   }
> >  
> > - iommu_init(name, is, tsbsize, iobase);
> > + iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
> >  }
> >  
> >  int
> > Index: include/pci_machdep.h
> > ===================================================================
> > RCS file: /cvs/src/sys/arch/sparc64/include/pci_machdep.h,v
> > retrieving revision 1.33
> > diff -u -p -r1.33 pci_machdep.h
> > --- include/pci_machdep.h 4 May 2016 14:30:01 -0000 1.33
> > +++ include/pci_machdep.h 10 May 2017 12:00:09 -0000
> > @@ -74,10 +74,13 @@ struct sparc_pci_chipset {
> >   pcireg_t (*conf_read)(pci_chipset_tag_t, pcitag_t, int);
> >   void (*conf_write)(pci_chipset_tag_t, pcitag_t, int, pcireg_t);
> >   int (*intr_map)(struct pci_attach_args *, pci_intr_handle_t *);
> > + int (*probe_device_hook)(void *, struct pci_attach_args *);
> >  };
> >  
> >  void pci_attach_hook(struct device *, struct device *,
> >       struct pcibus_attach_args *);
> > +int pci_probe_device_hook(pci_chipset_tag_t,
> > +    struct pci_attach_args *);
> >  int pci_bus_maxdevs(pci_chipset_tag_t, int);
> >  pcitag_t pci_make_tag(pci_chipset_tag_t, int, int, int);
> >  void pci_decompose_tag(pci_chipset_tag_t, pcitag_t, int *, int *,
> > @@ -102,8 +105,6 @@ int sparc64_pci_enumerate_bus(struct pc
> >      struct pci_attach_args *);
> >  
> >  #define PCI_MACHDEP_ENUMERATE_BUS sparc64_pci_enumerate_bus
> > -
> > -#define pci_probe_device_hook(c, a) (0)
> >  
> >  #define pci_min_powerstate(c, t) (PCI_PMCSR_STATE_D3)
> >  #define pci_set_powerstate_md(c, t, s, p)
>
>

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Andrew Grillet
Is the setup and teardown done per transfer, or when a file is opened and
closed? Or is it set up once per context switch of a task?

I am partly interested because I would like to improve mt one day (as a user
of tape and Sparc64 Txxx) if I get the time.

Andrew



On Fri, 19 Oct 2018 at 10:22, Mark Kettenis <[hidden email]> wrote:

> > Date: Fri, 19 Oct 2018 10:22:30 +1000
> > From: David Gwynne <[hidden email]>
> >
> > On Wed, May 10, 2017 at 10:09:59PM +1000, David Gwynne wrote:
> > > On Mon, May 08, 2017 at 11:03:58AM +1000, David Gwynne wrote:
> > > > on modern sparc64s (think fire or sparc enterprise Mx000 boxes),
> > > > setting up and tearing down the translation table entries (TTEs)
> > > > is very expensive. so expensive that the cost of doing it for disk
> > > > io has a noticable impact on compile times.
> > > >
> > > > now that there's a BUS_DMA_64BIT flag, we can use that to decide
> > > > to bypass the iommu for devices that set that flag, therefore
> > > > avoiding the cost of handling the TTEs.
> > > >
> > > > the following diff adds support for bypass mappings to the iommu
> > > > code on sparc64. it's based on a diff from kettenis@ back in 2009.
> > > > the main changes are around coping with the differences between
> > > > schizo/psycho and fire/oberon.
> > > >
> > > > the differences between the chips are now represented by a iommu_hw
> > > > struct. these differences include how to enable the iommu (now via
> > > > a function pointer), and masks for bypass addresses.
> > > >
> > > > ive tested this on oberon (on an m4000) and schizo (on a v880).
> > > > however, the bypass code isnt working on fire (v245s). to cope with
> > > > that for now, the iommu_hw struct lets drivers mask flag bits that
> > > > are handled when creating a dmamap. this means fire boards will
> > > > ignore BUS_DMA_64BIT until i can figure out whats wrong with them.
> > >
> > > i figured it out. it turns out Fire was working fine. however,
> > > enabling 64bit dva on the onboard devices didnt work because the
> > > serverworks/broadcom pcie to pcix bridge can only handle dma addresses
> > > in the low 40 bits. because the fire bypass window is higher than
> > > this, the bridge would choke and things stopped working.
> > >
> > > the updated diff attempts to handle this. basically when probing
> > > the bridge, the platform creates a custom dma tag for it. this tag
> > > intercets bus_dmamap_create and clears the BUS_DMA_64BIT flag before
> > > handing it up to the parent bridge, which is pyro in my situation.
> > > it looks like early sun4v boxes could make use of this too.
> > >
> > > > i have not tested this on psycho yet. if anyone has such a machine
> > > > and is willing to work with me to figure it out, please talk to me.
> > >
> > > i still dont have psycho reports.
> >
> > Would anyone object if I committed this? I've been running it for the
> > last release or two without issues, but with significant improvements in
> > performance on the machines involved.
>
> At the price of giving all PCI devices unrestricted access to memory.
>
> So I'm not eager to this, especially since on sun4v hardware bypassing
> the iommu isn't possible as soon as multiple domains are enabled.  And
> we lose a useful diagnostic when developing drivers.  Are you sure the
> iommu overhead can't be reduced some other way?  At some point we
> probably want to add iommu support on amd64 and arm64, but if that
> comes with a similar overhead as on sparc64 that's going to be a bit
> of an issue.
>
> > > Index: dev/iommu.c
> > > ===================================================================
> > > RCS file: /cvs/src/sys/arch/sparc64/dev/iommu.c,v
> > > retrieving revision 1.74
> > > diff -u -p -r1.74 iommu.c
> > > --- dev/iommu.c     30 Apr 2017 16:45:45 -0000      1.74
> > > +++ dev/iommu.c     10 May 2017 12:00:09 -0000
> > > @@ -100,6 +100,25 @@ void iommu_iomap_clear_pages(struct iomm
> > >  void _iommu_dvmamap_sync(bus_dma_tag_t, bus_dma_tag_t, bus_dmamap_t,
> > >      bus_addr_t, bus_size_t, int);
> > >
> > > +void iommu_hw_enable(struct iommu_state *);
> > > +
> > > +const struct iommu_hw iommu_hw_default = {
> > > +   .ihw_enable     = iommu_hw_enable,
> > > +
> > > +   .ihw_dvma_pa    = IOTTE_PAMASK,
> > > +
> > > +   .ihw_bypass     = 0x3fffUL << 50,
> > > +   .ihw_bypass_nc  = 0,
> > > +   .ihw_bypass_ro  = 0,
> > > +};
> > > +
> > > +void
> > > +iommu_hw_enable(struct iommu_state *is)
> > > +{
> > > +   IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
> > > +   IOMMUREG_WRITE(is, iommu_cr, IOMMUCR_EN | (is->is_tsbsize << 16));
> > > +}
> > > +
> > >  /*
> > >   * Initiate an STC entry flush.
> > >   */
> > > @@ -125,7 +144,8 @@ iommu_strbuf_flush(struct strbuf_ctl *sb
> > >   * - create a private DVMA map.
> > >   */
> > >  void
> > > -iommu_init(char *name, struct iommu_state *is, int tsbsize, u_int32_t
> iovabase)
> > > +iommu_init(char *name, const struct iommu_hw *ihw, struct iommu_state
> *is,
> > > +    int tsbsize, u_int32_t iovabase)
> > >  {
> > >     psize_t size;
> > >     vaddr_t va;
> > > @@ -149,13 +169,9 @@ iommu_init(char *name, struct iommu_stat
> > >      * be hard-wired, so we read the start and size from the PROM and
> > >      * just use those values.
> > >      */
> > > -   if (strncmp(name, "pyro", 4) == 0) {
> > > -           is->is_cr = IOMMUREG_READ(is, iommu_cr);
> > > -           is->is_cr &= ~IOMMUCR_FIRE_BE;
> > > -           is->is_cr |= (IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
> > > -               IOMMUCR_FIRE_TE);
> > > -   } else
> > > -           is->is_cr = IOMMUCR_EN;
> > > +
> > > +   is->is_hw = ihw;
> > > +
> > >     is->is_tsbsize = tsbsize;
> > >     if (iovabase == (u_int32_t)-1) {
> > >             is->is_dvmabase = IOTSB_VSTART(is->is_tsbsize);
> > > @@ -237,15 +253,6 @@ iommu_init(char *name, struct iommu_stat
> > >     mtx_init(&is->is_mtx, IPL_HIGH);
> > >
> > >     /*
> > > -    * Set the TSB size.  The relevant bits were moved to the TSB
> > > -    * base register in the PCIe host bridges.
> > > -    */
> > > -   if (strncmp(name, "pyro", 4) == 0)
> > > -           is->is_ptsb |= is->is_tsbsize;
> > > -   else
> > > -           is->is_cr |= (is->is_tsbsize << 16);
> > > -
> > > -   /*
> > >      * Now actually start up the IOMMU.
> > >      */
> > >     iommu_reset(is);
> > > @@ -262,10 +269,7 @@ iommu_reset(struct iommu_state *is)
> > >  {
> > >     int i;
> > >
> > > -   IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
> > > -
> > > -   /* Enable IOMMU */
> > > -   IOMMUREG_WRITE(is, iommu_cr, is->is_cr);
> > > +   (*is->is_hw->ihw_enable)(is);
> > >
> > >     for (i = 0; i < 2; ++i) {
> > >             struct strbuf_ctl *sb = is->is_sb[i];
> > > @@ -280,7 +284,7 @@ iommu_reset(struct iommu_state *is)
> > >                     printf(", STC%d enabled", i);
> > >     }
> > >
> > > -   if (is->is_flags & IOMMU_FLUSH_CACHE)
> > > +   if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE))
> > >             IOMMUREG_WRITE(is, iommu_cache_invalidate, -1ULL);
> > >  }
> > >
> > > @@ -433,7 +437,7 @@ iommu_extract(struct iommu_state *is, bu
> > >     if (dva >= is->is_dvmabase && dva <= is->is_dvmaend)
> > >             tte = is->is_tsb[IOTSBSLOT(dva, is->is_tsbsize)];
> > >
> > > -   return (tte & IOTTE_PAMASK);
> > > +   return (tte & is->is_hw->ihw_dvma_pa);
> > >  }
> > >
> > >  /*
> > > @@ -601,6 +605,7 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
> > >  {
> > >     int ret;
> > >     bus_dmamap_t map;
> > > +   struct iommu_state *is = sb->sb_iommu;
> > >     struct iommu_map_state *ims;
> > >
> > >     BUS_DMA_FIND_PARENT(t, _dmamap_create);
> > > @@ -610,6 +615,12 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
> > >     if (ret)
> > >             return (ret);
> > >
> > > +   if (flags & BUS_DMA_64BIT) {
> > > +           map->_dm_cookie = is;
> > > +           *dmamap = map;
> > > +           return (0);
> > > +   }
> > > +
> > >     ims = iommu_iomap_create(atop(round_page(size)));
> > >
> > >     if (ims == NULL) {
> > > @@ -641,8 +652,10 @@ iommu_dvmamap_destroy(bus_dma_tag_t t, b
> > >     if (map->dm_nsegs)
> > >             bus_dmamap_unload(t0, map);
> > >
> > > -        if (map->_dm_cookie)
> > > -                iommu_iomap_destroy(map->_dm_cookie);
> > > +   if (!ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> > > +           if (map->_dm_cookie)
> > > +                   iommu_iomap_destroy(map->_dm_cookie);
> > > +   }
> > >     map->_dm_cookie = NULL;
> > >
> > >     BUS_DMA_FIND_PARENT(t, _dmamap_destroy);
> > > @@ -667,36 +680,36 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_
> > >     u_long dvmaddr, sgstart, sgend;
> > >     bus_size_t align, boundary;
> > >     struct iommu_state *is;
> > > -   struct iommu_map_state *ims = map->_dm_cookie;
> > > +   struct iommu_map_state *ims;
> > >     pmap_t pmap;
> > >
> > > -#ifdef DIAGNOSTIC
> > > -   if (ims == NULL)
> > > -           panic("iommu_dvmamap_load: null map state");
> > > -#endif
> > > -#ifdef DEBUG
> > > -   if (ims->ims_sb == NULL)
> > > -           panic("iommu_dvmamap_load: null sb");
> > > -   if (ims->ims_sb->sb_iommu == NULL)
> > > -           panic("iommu_dvmamap_load: null iommu");
> > > -#endif /* DEBUG */
> > > -   is = ims->ims_sb->sb_iommu;
> > > -
> > > -   if (map->dm_nsegs) {
> > > -           /*
> > > -            * Is it still in use? _bus_dmamap_load should have taken
> care
> > > -            * of this.
> > > -            */
> > > -#ifdef DIAGNOSTIC
> > > -           panic("iommu_dvmamap_load: map still in use");
> > > -#endif
> > > -           bus_dmamap_unload(t0, map);
> > > -   }
> > > -
> > >     /*
> > >      * Make sure that on error condition we return "no valid mappings".
> > >      */
> > > -   map->dm_nsegs = 0;
> > > +   KASSERTMSG(map->dm_nsegs == 0, "map still in use");
> > > +
> > > +   if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> > > +           unsigned long bypass;
> > > +           int i;
> > > +
> > > +           is = map->_dm_cookie;
> > > +           bypass = is->is_hw->ihw_bypass;
> > > +
> > > +           /* Bypass translation by the IOMMU. */
> > > +
> > > +           BUS_DMA_FIND_PARENT(t, _dmamap_load);
> > > +           err = (*t->_dmamap_load)(t, t0, map, buf, buflen, p,
> flags);
> > > +           if (err != 0)
> > > +                   return (err);
> > > +
> > > +           for (i = 0; i < map->dm_nsegs; i++)
> > > +                   map->dm_segs[i].ds_addr |= bypass;
> > > +
> > > +           return (0);
> > > +   }
> > > +
> > > +   ims = map->_dm_cookie;
> > > +   is = ims->ims_sb->sb_iommu;
> > >
> > >     if (buflen < 1 || buflen > map->_dm_size) {
> > >             DPRINTF(IDB_BUSDMA,
> > > @@ -876,28 +889,31 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t,
> > >     bus_size_t boundary, align;
> > >     u_long dvmaddr, sgstart, sgend;
> > >     struct iommu_state *is;
> > > -   struct iommu_map_state *ims = map->_dm_cookie;
> > > +   struct iommu_map_state *ims;
> > >
> > > -#ifdef DIAGNOSTIC
> > > -   if (ims == NULL)
> > > -           panic("iommu_dvmamap_load_raw: null map state");
> > > -#endif
> > > -#ifdef DEBUG
> > > -   if (ims->ims_sb == NULL)
> > > -           panic("iommu_dvmamap_load_raw: null sb");
> > > -   if (ims->ims_sb->sb_iommu == NULL)
> > > -           panic("iommu_dvmamap_load_raw: null iommu");
> > > -#endif /* DEBUG */
> > > -   is = ims->ims_sb->sb_iommu;
> > > +   KASSERTMSG(map->dm_nsegs == 0, "map stil in use");
> > >
> > > -   if (map->dm_nsegs) {
> > > -           /* Already in use?? */
> > > -#ifdef DIAGNOSTIC
> > > -           panic("iommu_dvmamap_load_raw: map still in use");
> > > -#endif
> > > -           bus_dmamap_unload(t0, map);
> > > +   if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> > > +           unsigned long bypass;
> > > +
> > > +           is = map->_dm_cookie;
> > > +           bypass = is->is_hw->ihw_bypass;
> > > +
> > > +           /* Bypass translation by the IOMMU. */
> > > +           for (i = 0; i < nsegs; i++) {
> > > +                   map->dm_segs[i].ds_addr = bypass | segs[i].ds_addr;
> > > +                   map->dm_segs[i].ds_len = segs[i].ds_len;
> > > +           }
> > > +
> > > +           map->dm_nsegs = nsegs;
> > > +           map->dm_mapsize = size;
> > > +
> > > +           return (0);
> > >     }
> > >
> > > +   ims = map->_dm_cookie;
> > > +   is = ims->ims_sb->sb_iommu;
> > > +
> > >     /*
> > >      * A boundary presented to bus_dmamem_alloc() takes precedence
> > >      * over boundary in the map.
> > > @@ -1088,11 +1104,6 @@ iommu_dvmamap_append_range(bus_dma_tag_t
> > >     bus_dma_segment_t *seg = NULL;
> > >     int i = map->dm_nsegs;
> > >
> > > -#ifdef DEBUG
> > > -   if (ims == NULL)
> > > -           panic("iommu_dvmamap_append_range: null map state");
> > > -#endif
> > > -
> > >     sgstart = iommu_iomap_translate(ims, pa);
> > >     sgend = sgstart + length - 1;
> > >
> > > @@ -1298,20 +1309,17 @@ void
> > >  iommu_dvmamap_unload(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t
> map)
> > >  {
> > >     struct iommu_state *is;
> > > -   struct iommu_map_state *ims = map->_dm_cookie;
> > > +   struct iommu_map_state *ims;
> > >     bus_addr_t dvmaddr = map->_dm_dvmastart;
> > >     bus_size_t sgsize = map->_dm_dvmasize;
> > >     int error;
> > >
> > > -#ifdef DEBUG
> > > -   if (ims == NULL)
> > > -           panic("iommu_dvmamap_unload: null map state");
> > > -   if (ims->ims_sb == NULL)
> > > -           panic("iommu_dvmamap_unload: null sb");
> > > -   if (ims->ims_sb->sb_iommu == NULL)
> > > -           panic("iommu_dvmamap_unload: null iommu");
> > > -#endif /* DEBUG */
> > > +   if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> > > +           bus_dmamap_unload(t->_parent, map);
> > > +           return;
> > > +   }
> > >
> > > +   ims = map->_dm_cookie;
> > >     is = ims->ims_sb->sb_iommu;
> > >
> > >     /* Flush the iommu */
> > > @@ -1488,7 +1496,7 @@ iommu_dvmamap_print_map(bus_dma_tag_t t,
> > >             break;
> > >     }
> > >
> > > -   if (map->_dm_cookie) {
> > > +   if (!ISSET(map->_dm_flags, BUS_DMA_64BIT) && map->_dm_cookie !=
> NULL) {
> > >             struct iommu_map_state *ims = map->_dm_cookie;
> > >             struct iommu_page_map *ipm = &ims->ims_map;
> > >
> > > @@ -1546,19 +1554,19 @@ void
> > >  iommu_dvmamap_sync(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t
> map,
> > >      bus_addr_t offset, bus_size_t len, int ops)
> > >  {
> > > -   struct iommu_map_state *ims = map->_dm_cookie;
> > > +   struct iommu_map_state *ims;
> > >
> > > -#ifdef DIAGNOSTIC
> > > -   if (ims == NULL)
> > > -           panic("iommu_dvmamap_sync: null map state");
> > > -   if (ims->ims_sb == NULL)
> > > -           panic("iommu_dvmamap_sync: null sb");
> > > -   if (ims->ims_sb->sb_iommu == NULL)
> > > -           panic("iommu_dvmamap_sync: null iommu");
> > > -#endif
> > >     if (len == 0)
> > >             return;
> > >
> > > +   if (map->_dm_flags & BUS_DMA_64BIT) {
> > > +           if (ops & (BUS_DMASYNC_PREWRITE | BUS_DMASYNC_POSTREAD))
> > > +                   membar(MemIssue);
> > > +           return;
> > > +   }
> > > +
> > > +   ims = map->_dm_cookie;
> > > +
> > >     if (ops & BUS_DMASYNC_PREWRITE)
> > >             membar(MemIssue);
> > >
> > > @@ -1622,9 +1630,13 @@ iommu_dvmamem_alloc(bus_dma_tag_t t, bus
> > >         "bound %llx segp %p flags %d\n", (unsigned long long)size,
> > >         (unsigned long long)alignment, (unsigned long long)boundary,
> > >         segs, flags));
> > > +
> > > +   if ((flags & BUS_DMA_64BIT) == 0)
> > > +           flags |= BUS_DMA_DVMA;
> > > +
> > >     BUS_DMA_FIND_PARENT(t, _dmamem_alloc);
> > >     return ((*t->_dmamem_alloc)(t, t0, size, alignment, boundary,
> > > -       segs, nsegs, rsegs, flags | BUS_DMA_DVMA));
> > > +       segs, nsegs, rsegs, flags));
> > >  }
> > >
> > >  void
> > > @@ -1763,7 +1775,7 @@ iommu_iomap_load_map(struct iommu_state
> > >
> > >             /* Flush cache if necessary. */
> > >             slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
> > > -           if (is->is_flags & IOMMU_FLUSH_CACHE &&
> > > +           if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
> > >                 (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
> > >                     IOMMUREG_WRITE(is, iommu_cache_flush,
> > >                         is->is_ptsb + slot * 8);
> > > @@ -1788,7 +1800,7 @@ iommu_iomap_unload_map(struct iommu_stat
> > >
> > >             /* Flush cache if necessary. */
> > >             slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
> > > -           if (is->is_flags & IOMMU_FLUSH_CACHE &&
> > > +           if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
> > >                 (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
> > >                     IOMMUREG_WRITE(is, iommu_cache_flush,
> > >                         is->is_ptsb + slot * 8);
> > > Index: dev/iommureg.h
> > > ===================================================================
> > > RCS file: /cvs/src/sys/arch/sparc64/dev/iommureg.h,v
> > > retrieving revision 1.17
> > > diff -u -p -r1.17 iommureg.h
> > > --- dev/iommureg.h  17 Aug 2012 20:46:50 -0000      1.17
> > > +++ dev/iommureg.h  10 May 2017 12:00:09 -0000
> > > @@ -90,10 +90,11 @@ struct iommu_strbuf {
> > >  #define IOMMUCR_DE         0x000000000000000002LL  /* Diag enable */
> > >  #define IOMMUCR_EN         0x000000000000000001LL  /* Enable IOMMU */
> > >
> > > -#define IOMMUCR_FIRE_SE            0x000000000000000400LL  /* Snoop
> enable */
> > > -#define IOMMUCR_FIRE_CM_EN 0x000000000000000300LL  /* Cache mode
> enable */
> > > -#define IOMMUCR_FIRE_BE            0x000000000000000002LL  /* Bypass
> enable */
> > > -#define IOMMUCR_FIRE_TE            0x000000000000000001LL  /*
> Translation enabled */
> > > +#define IOMMUCR_FIRE_PD            0x000000000000001000UL  /* Process
> disable */
> > > +#define IOMMUCR_FIRE_SE            0x000000000000000400UL  /* Snoop
> enable */
> > > +#define IOMMUCR_FIRE_CM_EN 0x000000000000000300UL  /* Cache mode
> enable */
> > > +#define IOMMUCR_FIRE_BE            0x000000000000000002UL  /* Bypass
> enable */
> > > +#define IOMMUCR_FIRE_TE            0x000000000000000001UL  /*
> Translation enabled */
> > >
> > >  /*
> > >   * IOMMU stuff
> > > Index: dev/iommuvar.h
> > > ===================================================================
> > > RCS file: /cvs/src/sys/arch/sparc64/dev/iommuvar.h,v
> > > retrieving revision 1.17
> > > diff -u -p -r1.17 iommuvar.h
> > > --- dev/iommuvar.h  4 May 2016 18:26:12 -0000       1.17
> > > +++ dev/iommuvar.h  10 May 2017 12:00:09 -0000
> > > @@ -100,6 +100,21 @@ struct iommu_map_state {
> > >  };
> > >  #define IOMMU_MAP_STREAM   1
> > >
> > > +struct iommu_hw {
> > > +   void                    (*ihw_enable)(struct iommu_state *);
> > > +
> > > +   unsigned long           ihw_dvma_pa;
> > > +
> > > +   unsigned long           ihw_bypass;
> > > +   unsigned long           ihw_bypass_nc;          /* non-cached */
> > > +   unsigned long           ihw_bypass_ro;          /* relaxed
> ordering */
> > > +
> > > +   unsigned int            ihw_flags;
> > > +#define IOMMU_HW_FLUSH_CACHE               (1 << 0)
> > > +};
> > > +
> > > +extern const struct iommu_hw iommu_hw_default;
> > > +
> > >  /*
> > >   * per-IOMMU state
> > >   */
> > > @@ -112,8 +127,7 @@ struct iommu_state {
> > >     int64_t                 is_cr;          /* Control register value
> */
> > >     struct mutex            is_mtx;
> > >     struct extent           *is_dvmamap;    /* DVMA map for this
> instance */
> > > -   int                     is_flags;
> > > -#define IOMMU_FLUSH_CACHE  0x00000001
> > > +   const struct iommu_hw   *is_hw;
> > >
> > >     struct strbuf_ctl       *is_sb[2];      /* Streaming buffers if
> any */
> > >
> > > @@ -126,7 +140,8 @@ struct iommu_state {
> > >  };
> > >
> > >  /* interfaces for PCI/SBus code */
> > > -void       iommu_init(char *, struct iommu_state *, int, u_int32_t);
> > > +void       iommu_init(char *, const struct iommu_hw *, struct
> iommu_state *,
> > > +    int, u_int32_t);
> > >  void       iommu_reset(struct iommu_state *);
> > >  paddr_t iommu_extract(struct iommu_state *, bus_addr_t);
> > >  int64_t iommu_lookup_tte(struct iommu_state *, bus_addr_t);
> > > @@ -146,6 +161,7 @@ int     iommu_dvmamem_alloc(bus_dma_tag_t, b
> > >         bus_size_t, bus_size_t, bus_dma_segment_t *, int, int *, int);
> > >  void       iommu_dvmamem_free(bus_dma_tag_t, bus_dma_tag_t,
> bus_dma_segment_t *,
> > >         int);
> > > +
> > >
> > >  #define IOMMUREG_READ(is, reg)                             \
> > >     bus_space_read_8((is)->is_bustag,               \
> > > Index: dev/pci_machdep.c
> > > ===================================================================
> > > RCS file: /cvs/src/sys/arch/sparc64/dev/pci_machdep.c,v
> > > retrieving revision 1.44
> > > diff -u -p -r1.44 pci_machdep.c
> > > --- dev/pci_machdep.c       10 May 2014 12:15:19 -0000      1.44
> > > +++ dev/pci_machdep.c       10 May 2017 12:00:09 -0000
> > > @@ -58,6 +58,7 @@ int sparc_pci_debug = 0x0;
> > >  #include <machine/openfirm.h>
> > >  #include <dev/pci/pcivar.h>
> > >  #include <dev/pci/pcireg.h>
> > > +#include <dev/pci/pcidevs.h>
> > >
> > >  #include <dev/ofw/ofw_pci.h>
> > >
> > > @@ -85,6 +86,46 @@ pci_attach_hook(parent, self, pba)
> > >     struct pcibus_attach_args *pba;
> > >  {
> > >     /* Don't do anything */
> > > +}
> > > +
> > > +int
> > > +pci_bcm_dmamap_create(bus_dma_tag_t dt, bus_dma_tag_t t0, bus_size_t
> size,
> > > +    int nsegments, bus_size_t maxsegsz, bus_size_t boundary, int
> flags,
> > > +    bus_dmamap_t *dmamp)
> > > +{
> > > +   bus_dma_tag_t pdt = dt->_parent;
> > > +
> > > +   CLR(flags, BUS_DMA_64BIT);
> > > +
> > > +   return ((*pdt->_dmamap_create)(pdt, t0, size, nsegments, maxsegsz,
> > > +       boundary, flags, dmamp));
> > > +}
> > > +
> > > +int
> > > +pci_probe_device_hook(pci_chipset_tag_t pc, struct pci_attach_args
> *pa)
> > > +{
> > > +   bus_dma_tag_t dt, pdt;
> > > +
> > > +   if (pa->pa_id ==
> > > +       PCI_ID_CODE(PCI_VENDOR_RCC, PCI_PRODUCT_RCC_PCIE_PCIX)) {
> > > +           /*
> > > +            * These PCI bridges only support 40bit DVA, so intercept
> > > +            * bus_dmamap_create so we can clear BUS_DMA_64BIT.
> > > +            */
> > > +
> > > +           dt = malloc(sizeof(*dt), M_DEVBUF, M_NOWAIT | M_ZERO);
> > > +           if (dt == NULL)
> > > +                   panic("%s: could not alloc dma tag", __func__);
> > > +
> > > +           pdt = pa->pa_dmat;
> > > +
> > > +           dt->_parent = pdt;
> > > +           dt->_dmamap_create = pci_bcm_dmamap_create;
> > > +
> > > +           pa->pa_dmat = dt;
> > > +   }
> > > +
> > > +   return (0);
> > >  }
> > >
> > >  int
> > > Index: dev/psycho.c
> > > ===================================================================
> > > RCS file: /cvs/src/sys/arch/sparc64/dev/psycho.c,v
> > > retrieving revision 1.74
> > > diff -u -p -r1.74 psycho.c
> > > --- dev/psycho.c    23 Aug 2016 03:28:01 -0000      1.74
> > > +++ dev/psycho.c    10 May 2017 12:00:09 -0000
> > > @@ -902,7 +902,7 @@ psycho_iommu_init(struct psycho_softc *s
> > >             panic("couldn't malloc iommu name");
> > >     snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
> > >
> > > -   iommu_init(name, is, tsbsize, iobase);
> > > +   iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
> > >  }
> > >
> > >  /*
> > > Index: dev/pyro.c
> > > ===================================================================
> > > RCS file: /cvs/src/sys/arch/sparc64/dev/pyro.c,v
> > > retrieving revision 1.30
> > > diff -u -p -r1.30 pyro.c
> > > --- dev/pyro.c      20 Dec 2016 13:40:50 -0000      1.30
> > > +++ dev/pyro.c      10 May 2017 12:00:09 -0000
> > > @@ -131,6 +131,30 @@ int pyro_msi_eq_intr(void *);
> > >  int pyro_dmamap_create(bus_dma_tag_t, bus_dma_tag_t, bus_size_t, int,
> > >      bus_size_t, bus_size_t, int, bus_dmamap_t *);
> > >
> > > +void pyro_iommu_enable(struct iommu_state *);
> > > +
> > > +const struct iommu_hw iommu_hw_fire = {
> > > +   .ihw_enable     = pyro_iommu_enable,
> > > +
> > > +   .ihw_dvma_pa    = 0x000007ffffffffffUL,
> > > +
> > > +   .ihw_bypass     = 0xfffc000000000000UL,
> > > +   .ihw_bypass_nc  = 0x0000080000000000UL,
> > > +   .ihw_bypass_ro  = 0,
> > > +};
> > > +
> > > +const struct iommu_hw iommu_hw_oberon = {
> > > +   .ihw_enable     = pyro_iommu_enable,
> > > +
> > > +   .ihw_dvma_pa    = 0x00007fffffffffffUL,
> > > +
> > > +   .ihw_bypass     = 0x7ffc000000000000UL,
> > > +   .ihw_bypass_nc  = 0x0000800000000000UL,
> > > +   .ihw_bypass_ro  = 0x8000000000000000UL,
> > > +
> > > +   .ihw_flags      = IOMMU_HW_FLUSH_CACHE,
> > > +};
> > > +
> > >  #ifdef DDB
> > >  void pyro_xir(void *, int);
> > >  #endif
> > > @@ -266,6 +290,7 @@ pyro_init_iommu(struct pyro_softc *sc, s
> > >     int tsbsize = 7;
> > >     u_int32_t iobase = -1;
> > >     char *name;
> > > +   const struct iommu_hw *ihw = &iommu_hw_fire;
> > >
> > >     is->is_bustag = sc->sc_bust;
> > >
> > > @@ -282,11 +307,23 @@ pyro_init_iommu(struct pyro_softc *sc, s
> > >             panic("couldn't malloc iommu name");
> > >     snprintf(name, 32, "%s dvma", sc->sc_dv.dv_xname);
> > >
> > > -   /* On Oberon, we need to flush the cache. */
> > >     if (sc->sc_oberon)
> > > -           is->is_flags |= IOMMU_FLUSH_CACHE;
> > > +           ihw = &iommu_hw_oberon;
> > > +
> > > +   iommu_init(name, ihw, is, tsbsize, iobase);
> > > +}
> > > +
> > > +void
> > > +pyro_iommu_enable(struct iommu_state *is)
> > > +{
> > > +   unsigned long cr;
> > > +
> > > +   cr = IOMMUREG_READ(is, iommu_cr);
> > > +   cr |= IOMMUCR_FIRE_BE | IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
> > > +       IOMMUCR_FIRE_TE;
> > >
> > > -   iommu_init(name, is, tsbsize, iobase);
> > > +   IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb | is->is_tsbsize);
> > > +   IOMMUREG_WRITE(is, iommu_cr, cr);
> > >  }
> > >
> > >  void
> > > Index: dev/sbus.c
> > > ===================================================================
> > > RCS file: /cvs/src/sys/arch/sparc64/dev/sbus.c,v
> > > retrieving revision 1.44
> > > diff -u -p -r1.44 sbus.c
> > > --- dev/sbus.c      19 Sep 2015 21:07:04 -0000      1.44
> > > +++ dev/sbus.c      10 May 2017 12:00:09 -0000
> > > @@ -349,7 +349,7 @@ sbus_mb_attach(struct device *parent, st
> > >     snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
> > >
> > >     printf("%s: ", sc->sc_dev.dv_xname);
> > > -   iommu_init(name, &sc->sc_is, 0, -1);
> > > +   iommu_init(name, &iommu_hw_default, &sc->sc_is, 0, -1);
> > >
> > >     /* Initialize Starfire PC interrupt translation. */
> > >     if (OF_getprop(findroot(), "name", buf, sizeof(buf)) > 0 &&
> > > Index: dev/schizo.c
> > > ===================================================================
> > > RCS file: /cvs/src/sys/arch/sparc64/dev/schizo.c,v
> > > retrieving revision 1.67
> > > diff -u -p -r1.67 schizo.c
> > > --- dev/schizo.c    23 Aug 2016 03:28:01 -0000      1.67
> > > +++ dev/schizo.c    10 May 2017 12:00:09 -0000
> > > @@ -451,7 +451,7 @@ schizo_init_iommu(struct schizo_softc *s
> > >                 "using iobase=0x%x, tsbsize=%d\n", iobase, tsbsize));
> > >     }
> > >
> > > -   iommu_init(name, is, tsbsize, iobase);
> > > +   iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
> > >  }
> > >
> > >  int
> > > Index: include/pci_machdep.h
> > > ===================================================================
> > > RCS file: /cvs/src/sys/arch/sparc64/include/pci_machdep.h,v
> > > retrieving revision 1.33
> > > diff -u -p -r1.33 pci_machdep.h
> > > --- include/pci_machdep.h   4 May 2016 14:30:01 -0000       1.33
> > > +++ include/pci_machdep.h   10 May 2017 12:00:09 -0000
> > > @@ -74,10 +74,13 @@ struct sparc_pci_chipset {
> > >     pcireg_t (*conf_read)(pci_chipset_tag_t, pcitag_t, int);
> > >     void (*conf_write)(pci_chipset_tag_t, pcitag_t, int, pcireg_t);
> > >     int (*intr_map)(struct pci_attach_args *, pci_intr_handle_t *);
> > > +   int (*probe_device_hook)(void *, struct pci_attach_args *);
> > >  };
> > >
> > >  void               pci_attach_hook(struct device *, struct device *,
> > >                                  struct pcibus_attach_args *);
> > > +int                pci_probe_device_hook(pci_chipset_tag_t,
> > > +               struct pci_attach_args *);
> > >  int                pci_bus_maxdevs(pci_chipset_tag_t, int);
> > >  pcitag_t   pci_make_tag(pci_chipset_tag_t, int, int, int);
> > >  void               pci_decompose_tag(pci_chipset_tag_t, pcitag_t, int
> *, int *,
> > > @@ -102,8 +105,6 @@ int             sparc64_pci_enumerate_bus(struct pc
> > >                 struct pci_attach_args *);
> > >
> > >  #define PCI_MACHDEP_ENUMERATE_BUS sparc64_pci_enumerate_bus
> > > -
> > > -#define    pci_probe_device_hook(c, a)     (0)
> > >
> > >  #define    pci_min_powerstate(c, t)        (PCI_PMCSR_STATE_D3)
> > >  #define    pci_set_powerstate_md(c, t, s, p)
> >
> >
>
>
Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

David Gwynne-5
In reply to this post by Mark Kettenis


> On 19 Oct 2018, at 7:15 pm, Mark Kettenis <[hidden email]> wrote:
>
>> Date: Fri, 19 Oct 2018 10:22:30 +1000
>> From: David Gwynne <[hidden email]>
>>
>> On Wed, May 10, 2017 at 10:09:59PM +1000, David Gwynne wrote:
>>> On Mon, May 08, 2017 at 11:03:58AM +1000, David Gwynne wrote:
>>>> on modern sparc64s (think fire or sparc enterprise Mx000 boxes),
>>>> setting up and tearing down the translation table entries (TTEs)
>>>> is very expensive. so expensive that the cost of doing it for disk
>>>> io has a noticeable impact on compile times.
>>>>
>>>> now that there's a BUS_DMA_64BIT flag, we can use that to decide
>>>> to bypass the iommu for devices that set that flag, therefore
>>>> avoiding the cost of handling the TTEs.
>>>>
>>>> the following diff adds support for bypass mappings to the iommu
>>>> code on sparc64. it's based on a diff from kettenis@ back in 2009.
>>>> the main changes are around coping with the differences between
>>>> schizo/psycho and fire/oberon.
>>>>
>>>> the differences between the chips are now represented by a iommu_hw
>>>> struct. these differences include how to enable the iommu (now via
>>>> a function pointer), and masks for bypass addresses.
>>>>
>>>> ive tested this on oberon (on an m4000) and schizo (on a v880).
>>>> however, the bypass code isnt working on fire (v245s). to cope with
>>>> that for now, the iommu_hw struct lets drivers mask flag bits that
>>>> are handled when creating a dmamap. this means fire boards will
>>>> ignore BUS_DMA_64BIT until i can figure out whats wrong with them.
>>>
>>> i figured it out. it turns out Fire was working fine. however,
>>> enabling 64bit dva on the onboard devices didnt work because the
>>> serverworks/broadcom pcie to pcix bridge can only handle dma addresses
>>> in the low 40 bits. because the fire bypass window is higher than
>>> this, the bridge would choke and things stopped working.
>>>
>>> the updated diff attempts to handle this. basically when probing
>>> the bridge, the platform creates a custom dma tag for it. this tag
>>> intercepts bus_dmamap_create and clears the BUS_DMA_64BIT flag before
>>> handing it up to the parent bridge, which is pyro in my situation.
>>> it looks like early sun4v boxes could make use of this too.
>>>
>>>> i have not tested this on psycho yet. if anyone has such a machine
>>>> and is willing to work with me to figure it out, please talk to me.
>>>
>>> i still dont have psycho reports.
>>
>> Would anyone object if I committed this? I've been running it for the
>> last release or two without issues, but with significant improvements in
>> performance on the machines involved.
>
> At the price of giving all PCI devices unrestricted access to memory.
>
> So I'm not eager to do this, especially since on sun4v hardware bypassing
> the iommu isn't possible as soon as multiple domains are enabled.  And
> we lose a useful diagnostic when developing drivers.  Are you sure the
> iommu overhead can't be reduced some other way?  At some point we
> probably want to add iommu support on amd64 and arm64, but if that
> comes with a similar overhead as on sparc64 that's going to be a bit
> of an issue.

First, note that it doesn't turn the iommu off. By default drivers still go through it unless they opt out with BUS_DMA_64BIT. This is because the iommu is still in between the device and ram, and it provides the passthru window up at 0xfffc000000000000.

As an aside, and as hinted at in my previous mails, it means that devices behind a ppb6 at pci6 dev 0 function 0 "ServerWorks PCIE-PCIX" rev 0xb5 bridge cannot really use BUS_DMA_64BIT because those bridges are buggy and don't handle DVAs above the low 40 bits. That bridge is used in v215s, v245s, v445s, t1000s, and so on.

I have a theory that because of that bridge, there was a meme going around Sun at the time that it was cheaper to memcpy in and out of preallocated DMA memory than it was to do DMA for every packet or disk I/O or whatever.

Which leads me to the conclusion that an alternative to using the passthru window would be to have bus_dma preallocate the dmaable memory and bounce in and out of it. The performance hit I'm trying to avoid comes from setting up and tearing down the translation table entries (TTEs). If they already exist, you avoid that hit.

Bouncing is complicated though, both in the bus_dma layer and, especially, when it gets pushed into drivers.

The amount of overhead varies between machines. It seems to make less of a difference with nvme(4) in a slot that is not behind the dodgy bridge on a v245. It was about a 20 or 30 percent difference with gem(4) and tcpbench in a v880 (schizo). It is particularly bad on the M4000 I have, which is why I looked into this. There are orders of magnitude of difference between tcpbench results with a tweaked ix(4) depending on whether this diff is on or off. We've declined to enable mitigations before because of performance hits smaller than this.

dlg

>
>>> Index: dev/iommu.c
>>> ===================================================================
>>> RCS file: /cvs/src/sys/arch/sparc64/dev/iommu.c,v
>>> retrieving revision 1.74
>>> diff -u -p -r1.74 iommu.c
>>> --- dev/iommu.c 30 Apr 2017 16:45:45 -0000 1.74
>>> +++ dev/iommu.c 10 May 2017 12:00:09 -0000
>>> @@ -100,6 +100,25 @@ void iommu_iomap_clear_pages(struct iomm
>>> void _iommu_dvmamap_sync(bus_dma_tag_t, bus_dma_tag_t, bus_dmamap_t,
>>>     bus_addr_t, bus_size_t, int);
>>>
>>> +void iommu_hw_enable(struct iommu_state *);
>>> +
>>> +const struct iommu_hw iommu_hw_default = {
>>> + .ihw_enable = iommu_hw_enable,
>>> +
>>> + .ihw_dvma_pa = IOTTE_PAMASK,
>>> +
>>> + .ihw_bypass = 0x3fffUL << 50,
>>> + .ihw_bypass_nc = 0,
>>> + .ihw_bypass_ro = 0,
>>> +};
>>> +
>>> +void
>>> +iommu_hw_enable(struct iommu_state *is)
>>> +{
>>> + IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
>>> + IOMMUREG_WRITE(is, iommu_cr, IOMMUCR_EN | (is->is_tsbsize << 16));
>>> +}
>>> +
>>> /*
>>>  * Initiate an STC entry flush.
>>>  */
>>> @@ -125,7 +144,8 @@ iommu_strbuf_flush(struct strbuf_ctl *sb
>>>  * - create a private DVMA map.
>>>  */
>>> void
>>> -iommu_init(char *name, struct iommu_state *is, int tsbsize, u_int32_t iovabase)
>>> +iommu_init(char *name, const struct iommu_hw *ihw, struct iommu_state *is,
>>> +    int tsbsize, u_int32_t iovabase)
>>> {
>>> psize_t size;
>>> vaddr_t va;
>>> @@ -149,13 +169,9 @@ iommu_init(char *name, struct iommu_stat
>>> * be hard-wired, so we read the start and size from the PROM and
>>> * just use those values.
>>> */
>>> - if (strncmp(name, "pyro", 4) == 0) {
>>> - is->is_cr = IOMMUREG_READ(is, iommu_cr);
>>> - is->is_cr &= ~IOMMUCR_FIRE_BE;
>>> - is->is_cr |= (IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
>>> -    IOMMUCR_FIRE_TE);
>>> - } else
>>> - is->is_cr = IOMMUCR_EN;
>>> +
>>> + is->is_hw = ihw;
>>> +
>>> is->is_tsbsize = tsbsize;
>>> if (iovabase == (u_int32_t)-1) {
>>> is->is_dvmabase = IOTSB_VSTART(is->is_tsbsize);
>>> @@ -237,15 +253,6 @@ iommu_init(char *name, struct iommu_stat
>>> mtx_init(&is->is_mtx, IPL_HIGH);
>>>
>>> /*
>>> - * Set the TSB size.  The relevant bits were moved to the TSB
>>> - * base register in the PCIe host bridges.
>>> - */
>>> - if (strncmp(name, "pyro", 4) == 0)
>>> - is->is_ptsb |= is->is_tsbsize;
>>> - else
>>> - is->is_cr |= (is->is_tsbsize << 16);
>>> -
>>> - /*
>>> * Now actually start up the IOMMU.
>>> */
>>> iommu_reset(is);
>>> @@ -262,10 +269,7 @@ iommu_reset(struct iommu_state *is)
>>> {
>>> int i;
>>>
>>> - IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
>>> -
>>> - /* Enable IOMMU */
>>> - IOMMUREG_WRITE(is, iommu_cr, is->is_cr);
>>> + (*is->is_hw->ihw_enable)(is);
>>>
>>> for (i = 0; i < 2; ++i) {
>>> struct strbuf_ctl *sb = is->is_sb[i];
>>> @@ -280,7 +284,7 @@ iommu_reset(struct iommu_state *is)
>>> printf(", STC%d enabled", i);
>>> }
>>>
>>> - if (is->is_flags & IOMMU_FLUSH_CACHE)
>>> + if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE))
>>> IOMMUREG_WRITE(is, iommu_cache_invalidate, -1ULL);
>>> }
>>>
>>> @@ -433,7 +437,7 @@ iommu_extract(struct iommu_state *is, bu
>>> if (dva >= is->is_dvmabase && dva <= is->is_dvmaend)
>>> tte = is->is_tsb[IOTSBSLOT(dva, is->is_tsbsize)];
>>>
>>> - return (tte & IOTTE_PAMASK);
>>> + return (tte & is->is_hw->ihw_dvma_pa);
>>> }
>>>
>>> /*
>>> @@ -601,6 +605,7 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
>>> {
>>> int ret;
>>> bus_dmamap_t map;
>>> + struct iommu_state *is = sb->sb_iommu;
>>> struct iommu_map_state *ims;
>>>
>>> BUS_DMA_FIND_PARENT(t, _dmamap_create);
>>> @@ -610,6 +615,12 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
>>> if (ret)
>>> return (ret);
>>>
>>> + if (flags & BUS_DMA_64BIT) {
>>> + map->_dm_cookie = is;
>>> + *dmamap = map;
>>> + return (0);
>>> + }
>>> +
>>> ims = iommu_iomap_create(atop(round_page(size)));
>>>
>>> if (ims == NULL) {
>>> @@ -641,8 +652,10 @@ iommu_dvmamap_destroy(bus_dma_tag_t t, b
>>> if (map->dm_nsegs)
>>> bus_dmamap_unload(t0, map);
>>>
>>> -        if (map->_dm_cookie)
>>> -                iommu_iomap_destroy(map->_dm_cookie);
>>> + if (!ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
>>> +        if (map->_dm_cookie)
>>> + iommu_iomap_destroy(map->_dm_cookie);
>>> + }
>>> map->_dm_cookie = NULL;
>>>
>>> BUS_DMA_FIND_PARENT(t, _dmamap_destroy);
>>> @@ -667,36 +680,36 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_
>>> u_long dvmaddr, sgstart, sgend;
>>> bus_size_t align, boundary;
>>> struct iommu_state *is;
>>> - struct iommu_map_state *ims = map->_dm_cookie;
>>> + struct iommu_map_state *ims;
>>> pmap_t pmap;
>>>
>>> -#ifdef DIAGNOSTIC
>>> - if (ims == NULL)
>>> - panic("iommu_dvmamap_load: null map state");
>>> -#endif
>>> -#ifdef DEBUG
>>> - if (ims->ims_sb == NULL)
>>> - panic("iommu_dvmamap_load: null sb");
>>> - if (ims->ims_sb->sb_iommu == NULL)
>>> - panic("iommu_dvmamap_load: null iommu");
>>> -#endif /* DEBUG */
>>> - is = ims->ims_sb->sb_iommu;
>>> -
>>> - if (map->dm_nsegs) {
>>> - /*
>>> - * Is it still in use? _bus_dmamap_load should have taken care
>>> - * of this.
>>> - */
>>> -#ifdef DIAGNOSTIC
>>> - panic("iommu_dvmamap_load: map still in use");
>>> -#endif
>>> - bus_dmamap_unload(t0, map);
>>> - }
>>> -
>>> /*
>>> * Make sure that on error condition we return "no valid mappings".
>>> */
>>> - map->dm_nsegs = 0;
>>> + KASSERTMSG(map->dm_nsegs == 0, "map still in use");
>>> +
>>> + if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
>>> + unsigned long bypass;
>>> + int i;
>>> +
>>> + is = map->_dm_cookie;
>>> + bypass = is->is_hw->ihw_bypass;
>>> +
>>> + /* Bypass translation by the IOMMU. */
>>> +
>>> + BUS_DMA_FIND_PARENT(t, _dmamap_load);
>>> + err = (*t->_dmamap_load)(t, t0, map, buf, buflen, p, flags);
>>> + if (err != 0)
>>> + return (err);
>>> +
>>> + for (i = 0; i < map->dm_nsegs; i++)
>>> + map->dm_segs[i].ds_addr |= bypass;
>>> +
>>> + return (0);
>>> + }
>>> +
>>> + ims = map->_dm_cookie;
>>> + is = ims->ims_sb->sb_iommu;
>>>
>>> if (buflen < 1 || buflen > map->_dm_size) {
>>> DPRINTF(IDB_BUSDMA,
>>> @@ -876,28 +889,31 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t,
>>> bus_size_t boundary, align;
>>> u_long dvmaddr, sgstart, sgend;
>>> struct iommu_state *is;
>>> - struct iommu_map_state *ims = map->_dm_cookie;
>>> + struct iommu_map_state *ims;
>>>
>>> -#ifdef DIAGNOSTIC
>>> - if (ims == NULL)
>>> - panic("iommu_dvmamap_load_raw: null map state");
>>> -#endif
>>> -#ifdef DEBUG
>>> - if (ims->ims_sb == NULL)
>>> - panic("iommu_dvmamap_load_raw: null sb");
>>> - if (ims->ims_sb->sb_iommu == NULL)
>>> - panic("iommu_dvmamap_load_raw: null iommu");
>>> -#endif /* DEBUG */
>>> - is = ims->ims_sb->sb_iommu;
>>> + KASSERTMSG(map->dm_nsegs == 0, "map stil in use");
>>>
>>> - if (map->dm_nsegs) {
>>> - /* Already in use?? */
>>> -#ifdef DIAGNOSTIC
>>> - panic("iommu_dvmamap_load_raw: map still in use");
>>> -#endif
>>> - bus_dmamap_unload(t0, map);
>>> + if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
>>> + unsigned long bypass;
>>> +
>>> + is = map->_dm_cookie;
>>> + bypass = is->is_hw->ihw_bypass;
>>> +
>>> + /* Bypass translation by the IOMMU. */
>>> + for (i = 0; i < nsegs; i++) {
>>> + map->dm_segs[i].ds_addr = bypass | segs[i].ds_addr;
>>> + map->dm_segs[i].ds_len = segs[i].ds_len;
>>> + }
>>> +
>>> + map->dm_nsegs = nsegs;
>>> + map->dm_mapsize = size;
>>> +
>>> + return (0);
>>> }
>>>
>>> + ims = map->_dm_cookie;
>>> + is = ims->ims_sb->sb_iommu;
>>> +
>>> /*
>>> * A boundary presented to bus_dmamem_alloc() takes precedence
>>> * over boundary in the map.
>>> @@ -1088,11 +1104,6 @@ iommu_dvmamap_append_range(bus_dma_tag_t
>>> bus_dma_segment_t *seg = NULL;
>>> int i = map->dm_nsegs;
>>>
>>> -#ifdef DEBUG
>>> - if (ims == NULL)
>>> - panic("iommu_dvmamap_append_range: null map state");
>>> -#endif
>>> -
>>> sgstart = iommu_iomap_translate(ims, pa);
>>> sgend = sgstart + length - 1;
>>>
>>> @@ -1298,20 +1309,17 @@ void
>>> iommu_dvmamap_unload(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t map)
>>> {
>>> struct iommu_state *is;
>>> - struct iommu_map_state *ims = map->_dm_cookie;
>>> + struct iommu_map_state *ims;
>>> bus_addr_t dvmaddr = map->_dm_dvmastart;
>>> bus_size_t sgsize = map->_dm_dvmasize;
>>> int error;
>>>
>>> -#ifdef DEBUG
>>> - if (ims == NULL)
>>> - panic("iommu_dvmamap_unload: null map state");
>>> - if (ims->ims_sb == NULL)
>>> - panic("iommu_dvmamap_unload: null sb");
>>> - if (ims->ims_sb->sb_iommu == NULL)
>>> - panic("iommu_dvmamap_unload: null iommu");
>>> -#endif /* DEBUG */
>>> + if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
>>> + bus_dmamap_unload(t->_parent, map);
>>> + return;
>>> + }
>>>
>>> + ims = map->_dm_cookie;
>>> is = ims->ims_sb->sb_iommu;
>>>
>>> /* Flush the iommu */
>>> @@ -1488,7 +1496,7 @@ iommu_dvmamap_print_map(bus_dma_tag_t t,
>>> break;
>>> }
>>>
>>> - if (map->_dm_cookie) {
>>> + if (!ISSET(map->_dm_flags, BUS_DMA_64BIT) && map->_dm_cookie != NULL) {
>>> struct iommu_map_state *ims = map->_dm_cookie;
>>> struct iommu_page_map *ipm = &ims->ims_map;
>>>
>>> @@ -1546,19 +1554,19 @@ void
>>> iommu_dvmamap_sync(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t map,
>>>     bus_addr_t offset, bus_size_t len, int ops)
>>> {
>>> - struct iommu_map_state *ims = map->_dm_cookie;
>>> + struct iommu_map_state *ims;
>>>
>>> -#ifdef DIAGNOSTIC
>>> - if (ims == NULL)
>>> - panic("iommu_dvmamap_sync: null map state");
>>> - if (ims->ims_sb == NULL)
>>> - panic("iommu_dvmamap_sync: null sb");
>>> - if (ims->ims_sb->sb_iommu == NULL)
>>> - panic("iommu_dvmamap_sync: null iommu");
>>> -#endif
>>> if (len == 0)
>>> return;
>>>
>>> + if (map->_dm_flags & BUS_DMA_64BIT) {
>>> + if (ops & (BUS_DMASYNC_PREWRITE | BUS_DMASYNC_POSTREAD))
>>> + membar(MemIssue);
>>> + return;
>>> + }
>>> +
>>> + ims = map->_dm_cookie;
>>> +
>>> if (ops & BUS_DMASYNC_PREWRITE)
>>> membar(MemIssue);
>>>
>>> @@ -1622,9 +1630,13 @@ iommu_dvmamem_alloc(bus_dma_tag_t t, bus
>>>    "bound %llx segp %p flags %d\n", (unsigned long long)size,
>>>    (unsigned long long)alignment, (unsigned long long)boundary,
>>>    segs, flags));
>>> +
>>> + if ((flags & BUS_DMA_64BIT) == 0)
>>> + flags |= BUS_DMA_DVMA;
>>> +
>>> BUS_DMA_FIND_PARENT(t, _dmamem_alloc);
>>> return ((*t->_dmamem_alloc)(t, t0, size, alignment, boundary,
>>> -    segs, nsegs, rsegs, flags | BUS_DMA_DVMA));
>>> +    segs, nsegs, rsegs, flags));
>>> }
>>>
>>> void
>>> @@ -1763,7 +1775,7 @@ iommu_iomap_load_map(struct iommu_state
>>>
>>> /* Flush cache if necessary. */
>>> slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
>>> - if (is->is_flags & IOMMU_FLUSH_CACHE &&
>>> + if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
>>>    (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
>>> IOMMUREG_WRITE(is, iommu_cache_flush,
>>>    is->is_ptsb + slot * 8);
>>> @@ -1788,7 +1800,7 @@ iommu_iomap_unload_map(struct iommu_stat
>>>
>>> /* Flush cache if necessary. */
>>> slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
>>> - if (is->is_flags & IOMMU_FLUSH_CACHE &&
>>> + if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
>>>    (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
>>> IOMMUREG_WRITE(is, iommu_cache_flush,
>>>    is->is_ptsb + slot * 8);
>>> Index: dev/iommureg.h
>>> ===================================================================
>>> RCS file: /cvs/src/sys/arch/sparc64/dev/iommureg.h,v
>>> retrieving revision 1.17
>>> diff -u -p -r1.17 iommureg.h
>>> --- dev/iommureg.h 17 Aug 2012 20:46:50 -0000 1.17
>>> +++ dev/iommureg.h 10 May 2017 12:00:09 -0000
>>> @@ -90,10 +90,11 @@ struct iommu_strbuf {
>>> #define IOMMUCR_DE 0x000000000000000002LL /* Diag enable */
>>> #define IOMMUCR_EN 0x000000000000000001LL /* Enable IOMMU */
>>>
>>> -#define IOMMUCR_FIRE_SE 0x000000000000000400LL /* Snoop enable */
>>> -#define IOMMUCR_FIRE_CM_EN 0x000000000000000300LL  /* Cache mode enable */
>>> -#define IOMMUCR_FIRE_BE 0x000000000000000002LL /* Bypass enable */
>>> -#define IOMMUCR_FIRE_TE 0x000000000000000001LL /* Translation enabled */
>>> +#define IOMMUCR_FIRE_PD 0x000000000000001000UL /* Process disable */
>>> +#define IOMMUCR_FIRE_SE 0x000000000000000400UL /* Snoop enable */
>>> +#define IOMMUCR_FIRE_CM_EN 0x000000000000000300UL  /* Cache mode enable */
>>> +#define IOMMUCR_FIRE_BE 0x000000000000000002UL /* Bypass enable */
>>> +#define IOMMUCR_FIRE_TE 0x000000000000000001UL /* Translation enabled */
>>>
>>> /*
>>>  * IOMMU stuff
>>> Index: dev/iommuvar.h
>>> ===================================================================
>>> RCS file: /cvs/src/sys/arch/sparc64/dev/iommuvar.h,v
>>> retrieving revision 1.17
>>> diff -u -p -r1.17 iommuvar.h
>>> --- dev/iommuvar.h 4 May 2016 18:26:12 -0000 1.17
>>> +++ dev/iommuvar.h 10 May 2017 12:00:09 -0000
>>> @@ -100,6 +100,21 @@ struct iommu_map_state {
>>> };
>>> #define IOMMU_MAP_STREAM 1
>>>
>>> +struct iommu_hw {
>>> + void (*ihw_enable)(struct iommu_state *);
>>> +
>>> + unsigned long ihw_dvma_pa;
>>> +
>>> + unsigned long ihw_bypass;
>>> + unsigned long ihw_bypass_nc; /* non-cached */
>>> + unsigned long ihw_bypass_ro; /* relaxed ordering */
>>> +
>>> + unsigned int ihw_flags;
>>> +#define IOMMU_HW_FLUSH_CACHE (1 << 0)
>>> +};
>>> +
>>> +extern const struct iommu_hw iommu_hw_default;
>>> +
>>> /*
>>>  * per-IOMMU state
>>>  */
>>> @@ -112,8 +127,7 @@ struct iommu_state {
>>> int64_t is_cr; /* Control register value */
>>> struct mutex is_mtx;
>>> struct extent *is_dvmamap; /* DVMA map for this instance */
>>> - int is_flags;
>>> -#define IOMMU_FLUSH_CACHE 0x00000001
>>> + const struct iommu_hw *is_hw;
>>>
>>> struct strbuf_ctl *is_sb[2]; /* Streaming buffers if any */
>>>
>>> @@ -126,7 +140,8 @@ struct iommu_state {
>>> };
>>>
>>> /* interfaces for PCI/SBus code */
>>> -void iommu_init(char *, struct iommu_state *, int, u_int32_t);
>>> +void iommu_init(char *, const struct iommu_hw *, struct iommu_state *,
>>> +    int, u_int32_t);
>>> void iommu_reset(struct iommu_state *);
>>> paddr_t iommu_extract(struct iommu_state *, bus_addr_t);
>>> int64_t iommu_lookup_tte(struct iommu_state *, bus_addr_t);
>>> @@ -146,6 +161,7 @@ int iommu_dvmamem_alloc(bus_dma_tag_t, b
>>>    bus_size_t, bus_size_t, bus_dma_segment_t *, int, int *, int);
>>> void iommu_dvmamem_free(bus_dma_tag_t, bus_dma_tag_t, bus_dma_segment_t *,
>>>    int);
>>> +
>>>
>>> #define IOMMUREG_READ(is, reg) \
>>> bus_space_read_8((is)->is_bustag, \
>>> Index: dev/pci_machdep.c
>>> ===================================================================
>>> RCS file: /cvs/src/sys/arch/sparc64/dev/pci_machdep.c,v
>>> retrieving revision 1.44
>>> diff -u -p -r1.44 pci_machdep.c
>>> --- dev/pci_machdep.c 10 May 2014 12:15:19 -0000 1.44
>>> +++ dev/pci_machdep.c 10 May 2017 12:00:09 -0000
>>> @@ -58,6 +58,7 @@ int sparc_pci_debug = 0x0;
>>> #include <machine/openfirm.h>
>>> #include <dev/pci/pcivar.h>
>>> #include <dev/pci/pcireg.h>
>>> +#include <dev/pci/pcidevs.h>
>>>
>>> #include <dev/ofw/ofw_pci.h>
>>>
>>> @@ -85,6 +86,46 @@ pci_attach_hook(parent, self, pba)
>>> struct pcibus_attach_args *pba;
>>> {
>>> /* Don't do anything */
>>> +}
>>> +
>>> +int
>>> +pci_bcm_dmamap_create(bus_dma_tag_t dt, bus_dma_tag_t t0, bus_size_t size,
>>> +    int nsegments, bus_size_t maxsegsz, bus_size_t boundary, int flags,
>>> +    bus_dmamap_t *dmamp)
>>> +{
>>> + bus_dma_tag_t pdt = dt->_parent;
>>> +
>>> + CLR(flags, BUS_DMA_64BIT);
>>> +
>>> + return ((*pdt->_dmamap_create)(pdt, t0, size, nsegments, maxsegsz,
>>> +    boundary, flags, dmamp));
>>> +}
>>> +
>>> +int
>>> +pci_probe_device_hook(pci_chipset_tag_t pc, struct pci_attach_args *pa)
>>> +{
>>> + bus_dma_tag_t dt, pdt;
>>> +
>>> + if (pa->pa_id ==
>>> +    PCI_ID_CODE(PCI_VENDOR_RCC, PCI_PRODUCT_RCC_PCIE_PCIX)) {
>>> + /*
>>> + * These PCI bridges only support 40bit DVA, so intercept
>>> + * bus_dmamap_create so we can clear BUS_DMA_64BIT.
>>> + */
>>> +
>>> + dt = malloc(sizeof(*dt), M_DEVBUF, M_NOWAIT | M_ZERO);
>>> + if (dt == NULL)
>>> + panic("%s: could not alloc dma tag", __func__);
>>> +
>>> + pdt = pa->pa_dmat;
>>> +
>>> + dt->_parent = pdt;
>>> + dt->_dmamap_create = pci_bcm_dmamap_create;
>>> +
>>> + pa->pa_dmat = dt;
>>> + }
>>> +
>>> + return (0);
>>> }
>>>
>>> int
>>> Index: dev/psycho.c
>>> ===================================================================
>>> RCS file: /cvs/src/sys/arch/sparc64/dev/psycho.c,v
>>> retrieving revision 1.74
>>> diff -u -p -r1.74 psycho.c
>>> --- dev/psycho.c 23 Aug 2016 03:28:01 -0000 1.74
>>> +++ dev/psycho.c 10 May 2017 12:00:09 -0000
>>> @@ -902,7 +902,7 @@ psycho_iommu_init(struct psycho_softc *s
>>> panic("couldn't malloc iommu name");
>>> snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
>>>
>>> - iommu_init(name, is, tsbsize, iobase);
>>> + iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
>>> }
>>>
>>> /*
>>> Index: dev/pyro.c
>>> ===================================================================
>>> RCS file: /cvs/src/sys/arch/sparc64/dev/pyro.c,v
>>> retrieving revision 1.30
>>> diff -u -p -r1.30 pyro.c
>>> --- dev/pyro.c 20 Dec 2016 13:40:50 -0000 1.30
>>> +++ dev/pyro.c 10 May 2017 12:00:09 -0000
>>> @@ -131,6 +131,30 @@ int pyro_msi_eq_intr(void *);
>>> int pyro_dmamap_create(bus_dma_tag_t, bus_dma_tag_t, bus_size_t, int,
>>>     bus_size_t, bus_size_t, int, bus_dmamap_t *);
>>>
>>> +void pyro_iommu_enable(struct iommu_state *);
>>> +
>>> +const struct iommu_hw iommu_hw_fire = {
>>> + .ihw_enable = pyro_iommu_enable,
>>> +
>>> + .ihw_dvma_pa = 0x000007ffffffffffUL,
>>> +
>>> + .ihw_bypass = 0xfffc000000000000UL,
>>> + .ihw_bypass_nc = 0x0000080000000000UL,
>>> + .ihw_bypass_ro = 0,
>>> +};
>>> +
>>> +const struct iommu_hw iommu_hw_oberon = {
>>> + .ihw_enable = pyro_iommu_enable,
>>> +
>>> + .ihw_dvma_pa = 0x00007fffffffffffUL,
>>> +
>>> + .ihw_bypass = 0x7ffc000000000000UL,
>>> + .ihw_bypass_nc = 0x0000800000000000UL,
>>> + .ihw_bypass_ro = 0x8000000000000000UL,
>>> +
>>> + .ihw_flags = IOMMU_HW_FLUSH_CACHE,
>>> +};
>>> +
>>> #ifdef DDB
>>> void pyro_xir(void *, int);
>>> #endif
>>> @@ -266,6 +290,7 @@ pyro_init_iommu(struct pyro_softc *sc, s
>>> int tsbsize = 7;
>>> u_int32_t iobase = -1;
>>> char *name;
>>> + const struct iommu_hw *ihw = &iommu_hw_fire;
>>>
>>> is->is_bustag = sc->sc_bust;
>>>
>>> @@ -282,11 +307,23 @@ pyro_init_iommu(struct pyro_softc *sc, s
>>> panic("couldn't malloc iommu name");
>>> snprintf(name, 32, "%s dvma", sc->sc_dv.dv_xname);
>>>
>>> - /* On Oberon, we need to flush the cache. */
>>> if (sc->sc_oberon)
>>> - is->is_flags |= IOMMU_FLUSH_CACHE;
>>> + ihw = &iommu_hw_oberon;
>>> +
>>> + iommu_init(name, ihw, is, tsbsize, iobase);
>>> +}
>>> +
>>> +void
>>> +pyro_iommu_enable(struct iommu_state *is)
>>> +{
>>> + unsigned long cr;
>>> +
>>> + cr = IOMMUREG_READ(is, iommu_cr);
>>> + cr |= IOMMUCR_FIRE_BE | IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
>>> +    IOMMUCR_FIRE_TE;
>>>
>>> - iommu_init(name, is, tsbsize, iobase);
>>> + IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb | is->is_tsbsize);
>>> + IOMMUREG_WRITE(is, iommu_cr, cr);
>>> }
>>>
>>> void
>>> Index: dev/sbus.c
>>> ===================================================================
>>> RCS file: /cvs/src/sys/arch/sparc64/dev/sbus.c,v
>>> retrieving revision 1.44
>>> diff -u -p -r1.44 sbus.c
>>> --- dev/sbus.c 19 Sep 2015 21:07:04 -0000 1.44
>>> +++ dev/sbus.c 10 May 2017 12:00:09 -0000
>>> @@ -349,7 +349,7 @@ sbus_mb_attach(struct device *parent, st
>>> snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
>>>
>>> printf("%s: ", sc->sc_dev.dv_xname);
>>> - iommu_init(name, &sc->sc_is, 0, -1);
>>> + iommu_init(name, &iommu_hw_default, &sc->sc_is, 0, -1);
>>>
>>> /* Initialize Starfire PC interrupt translation. */
>>> if (OF_getprop(findroot(), "name", buf, sizeof(buf)) > 0 &&
>>> Index: dev/schizo.c
>>> ===================================================================
>>> RCS file: /cvs/src/sys/arch/sparc64/dev/schizo.c,v
>>> retrieving revision 1.67
>>> diff -u -p -r1.67 schizo.c
>>> --- dev/schizo.c 23 Aug 2016 03:28:01 -0000 1.67
>>> +++ dev/schizo.c 10 May 2017 12:00:09 -0000
>>> @@ -451,7 +451,7 @@ schizo_init_iommu(struct schizo_softc *s
>>>    "using iobase=0x%x, tsbsize=%d\n", iobase, tsbsize));
>>> }
>>>
>>> - iommu_init(name, is, tsbsize, iobase);
>>> + iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
>>> }
>>>
>>> int
>>> Index: include/pci_machdep.h
>>> ===================================================================
>>> RCS file: /cvs/src/sys/arch/sparc64/include/pci_machdep.h,v
>>> retrieving revision 1.33
>>> diff -u -p -r1.33 pci_machdep.h
>>> --- include/pci_machdep.h 4 May 2016 14:30:01 -0000 1.33
>>> +++ include/pci_machdep.h 10 May 2017 12:00:09 -0000
>>> @@ -74,10 +74,13 @@ struct sparc_pci_chipset {
>>> pcireg_t (*conf_read)(pci_chipset_tag_t, pcitag_t, int);
>>> void (*conf_write)(pci_chipset_tag_t, pcitag_t, int, pcireg_t);
>>> int (*intr_map)(struct pci_attach_args *, pci_intr_handle_t *);
>>> + int (*probe_device_hook)(void *, struct pci_attach_args *);
>>> };
>>>
>>> void pci_attach_hook(struct device *, struct device *,
>>>     struct pcibus_attach_args *);
>>> +int pci_probe_device_hook(pci_chipset_tag_t,
>>> +    struct pci_attach_args *);
>>> int pci_bus_maxdevs(pci_chipset_tag_t, int);
>>> pcitag_t pci_make_tag(pci_chipset_tag_t, int, int, int);
>>> void pci_decompose_tag(pci_chipset_tag_t, pcitag_t, int *, int *,
>>> @@ -102,8 +105,6 @@ int sparc64_pci_enumerate_bus(struct pc
>>>    struct pci_attach_args *);
>>>
>>> #define PCI_MACHDEP_ENUMERATE_BUS sparc64_pci_enumerate_bus
>>> -
>>> -#define pci_probe_device_hook(c, a) (0)
>>>
>>> #define pci_min_powerstate(c, t) (PCI_PMCSR_STATE_D3)
>>> #define pci_set_powerstate_md(c, t, s, p)

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

David Gwynne-5
In reply to this post by Andrew Grillet


> On 19 Oct 2018, at 9:59 pm, Andrew Grillet <[hidden email]> wrote:
>
> Is the setup and teardown per transfer or when file is opened and closed?
> Or is it set up once per context switch of task?
>
> I am partly interested cos I would like to improve mt one day (as user of
> tape
> and Sparc64 Txxx) if I get the time.
>
> Andrew

The overhead is per transfer. You might not get better performance out of a tx000 because of the PCIe bridges involved, but you may also be lucky and not have that bridge in the way.

>
>
>
> On Fri, 19 Oct 2018 at 10:22, Mark Kettenis <[hidden email]> wrote:
>
>>> Date: Fri, 19 Oct 2018 10:22:30 +1000
>>> From: David Gwynne <[hidden email]>
>>>
>>> On Wed, May 10, 2017 at 10:09:59PM +1000, David Gwynne wrote:
>>>> On Mon, May 08, 2017 at 11:03:58AM +1000, David Gwynne wrote:
>>>>> on modern sparc64s (think fire or sparc enterprise Mx000 boxes),
>>>>> setting up and tearing down the translation table entries (TTEs)
>>>>> is very expensive. so expensive that the cost of doing it for disk
>>>>> io has a noticeable impact on compile times.
>>>>>
>>>>> now that there's a BUS_DMA_64BIT flag, we can use that to decide
>>>>> to bypass the iommu for devices that set that flag, therefore
>>>>> avoiding the cost of handling the TTEs.
>>>>>
>>>>> the following diff adds support for bypass mappings to the iommu
>>>>> code on sparc64. it's based on a diff from kettenis@ back in 2009.
>>>>> the main changes are around coping with the differences between
>>>>> schizo/psycho and fire/oberon.
>>>>>
>>>>> the differences between the chips are now represented by a iommu_hw
>>>>> struct. these differences include how to enable the iommu (now via
>>>>> a function pointer), and masks for bypass addresses.
>>>>>
>>>>> ive tested this on oberon (on an m4000) and schizo (on a v880).
>>>>> however, the bypass code isnt working on fire (v245s). to cope with
>>>>> that for now, the iommu_hw struct lets drivers mask flag bits that
>>>>> are handled when creating a dmamap. this means fire boards will
>>>>> ignore BUS_DMA_64BIT until i can figure out whats wrong with them.
>>>>
>>>> i figured it out. it turns out Fire was working fine. however,
>>>> enabling 64bit dva on the onboard devices didnt work because the
>>>> serverworks/broadcom pcie to pcix bridge can only handle dma addresses
>>>> in the low 40 bits. because the fire bypass window is higher than
>>>> this, the bridge would choke and things stopped working.
>>>>
>>>> the updated diff attempts to handle this. basically when probing
>>>> the bridge, the platform creates a custom dma tag for it. this tag
>>>> intercepts bus_dmamap_create and clears the BUS_DMA_64BIT flag before
>>>> handing it up to the parent bridge, which is pyro in my situation.
>>>> it looks like early sun4v boxes could make use of this too.
>>>>
>>>>> i have not tested this on psycho yet. if anyone has such a machine
>>>>> and is willing to work with me to figure it out, please talk to me.
>>>>
>>>> i still dont have psycho reports.
>>>
>>> Would anyone object if I committed this? I've been running it for the
>>> last release or two without issues, but with significant improvements in
>>> performance on the machines involved.
>>
>> At the price of giving all PCI devices unrestricted access to memory.
>>
>> So I'm not eager to do this, especially since on sun4v hardware bypassing
>> the iommu isn't possible as soon as multiple domains are enabled.  And
>> we lose a useful diagnostic when developing drivers.  Are you sure the
>> iommu overhead can't be reduced some other way?  At some point we
>> probably want to add iommu support on amd64 and arm64, but if that
>> comes with a similar overhead as on sparc64 that's going to be a bit
>> of an issue.
>>
>>>> Index: dev/iommu.c
>>>> ===================================================================
>>>> RCS file: /cvs/src/sys/arch/sparc64/dev/iommu.c,v
>>>> retrieving revision 1.74
>>>> diff -u -p -r1.74 iommu.c
>>>> --- dev/iommu.c     30 Apr 2017 16:45:45 -0000      1.74
>>>> +++ dev/iommu.c     10 May 2017 12:00:09 -0000
>>>> @@ -100,6 +100,25 @@ void iommu_iomap_clear_pages(struct iomm
>>>> void _iommu_dvmamap_sync(bus_dma_tag_t, bus_dma_tag_t, bus_dmamap_t,
>>>>     bus_addr_t, bus_size_t, int);
>>>>
>>>> +void iommu_hw_enable(struct iommu_state *);
>>>> +
>>>> +const struct iommu_hw iommu_hw_default = {
>>>> +   .ihw_enable     = iommu_hw_enable,
>>>> +
>>>> +   .ihw_dvma_pa    = IOTTE_PAMASK,
>>>> +
>>>> +   .ihw_bypass     = 0x3fffUL << 50,
>>>> +   .ihw_bypass_nc  = 0,
>>>> +   .ihw_bypass_ro  = 0,
>>>> +};
>>>> +
>>>> +void
>>>> +iommu_hw_enable(struct iommu_state *is)
>>>> +{
>>>> +   IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
>>>> +   IOMMUREG_WRITE(is, iommu_cr, IOMMUCR_EN | (is->is_tsbsize << 16));
>>>> +}
>>>> +
>>>> /*
>>>>  * Initiate an STC entry flush.
>>>>  */
>>>> @@ -125,7 +144,8 @@ iommu_strbuf_flush(struct strbuf_ctl *sb
>>>>  * - create a private DVMA map.
>>>>  */
>>>> void
>>>> -iommu_init(char *name, struct iommu_state *is, int tsbsize, u_int32_t
>> iovabase)
>>>> +iommu_init(char *name, const struct iommu_hw *ihw, struct iommu_state
>> *is,
>>>> +    int tsbsize, u_int32_t iovabase)
>>>> {
>>>>    psize_t size;
>>>>    vaddr_t va;
>>>> @@ -149,13 +169,9 @@ iommu_init(char *name, struct iommu_stat
>>>>     * be hard-wired, so we read the start and size from the PROM and
>>>>     * just use those values.
>>>>     */
>>>> -   if (strncmp(name, "pyro", 4) == 0) {
>>>> -           is->is_cr = IOMMUREG_READ(is, iommu_cr);
>>>> -           is->is_cr &= ~IOMMUCR_FIRE_BE;
>>>> -           is->is_cr |= (IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
>>>> -               IOMMUCR_FIRE_TE);
>>>> -   } else
>>>> -           is->is_cr = IOMMUCR_EN;
>>>> +
>>>> +   is->is_hw = ihw;
>>>> +
>>>>    is->is_tsbsize = tsbsize;
>>>>    if (iovabase == (u_int32_t)-1) {
>>>>            is->is_dvmabase = IOTSB_VSTART(is->is_tsbsize);
>>>> @@ -237,15 +253,6 @@ iommu_init(char *name, struct iommu_stat
>>>>    mtx_init(&is->is_mtx, IPL_HIGH);
>>>>
>>>>    /*
>>>> -    * Set the TSB size.  The relevant bits were moved to the TSB
>>>> -    * base register in the PCIe host bridges.
>>>> -    */
>>>> -   if (strncmp(name, "pyro", 4) == 0)
>>>> -           is->is_ptsb |= is->is_tsbsize;
>>>> -   else
>>>> -           is->is_cr |= (is->is_tsbsize << 16);
>>>> -
>>>> -   /*
>>>>     * Now actually start up the IOMMU.
>>>>     */
>>>>    iommu_reset(is);
>>>> @@ -262,10 +269,7 @@ iommu_reset(struct iommu_state *is)
>>>> {
>>>>    int i;
>>>>
>>>> -   IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
>>>> -
>>>> -   /* Enable IOMMU */
>>>> -   IOMMUREG_WRITE(is, iommu_cr, is->is_cr);
>>>> +   (*is->is_hw->ihw_enable)(is);
>>>>
>>>>    for (i = 0; i < 2; ++i) {
>>>>            struct strbuf_ctl *sb = is->is_sb[i];
>>>> @@ -280,7 +284,7 @@ iommu_reset(struct iommu_state *is)
>>>>                    printf(", STC%d enabled", i);
>>>>    }
>>>>
>>>> -   if (is->is_flags & IOMMU_FLUSH_CACHE)
>>>> +   if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE))
>>>>            IOMMUREG_WRITE(is, iommu_cache_invalidate, -1ULL);
>>>> }
>>>>
>>>> @@ -433,7 +437,7 @@ iommu_extract(struct iommu_state *is, bu
>>>>    if (dva >= is->is_dvmabase && dva <= is->is_dvmaend)
>>>>            tte = is->is_tsb[IOTSBSLOT(dva, is->is_tsbsize)];
>>>>
>>>> -   return (tte & IOTTE_PAMASK);
>>>> +   return (tte & is->is_hw->ihw_dvma_pa);
>>>> }
>>>>
>>>> /*
>>>> @@ -601,6 +605,7 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
>>>> {
>>>>    int ret;
>>>>    bus_dmamap_t map;
>>>> +   struct iommu_state *is = sb->sb_iommu;
>>>>    struct iommu_map_state *ims;
>>>>
>>>>    BUS_DMA_FIND_PARENT(t, _dmamap_create);
>>>> @@ -610,6 +615,12 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
>>>>    if (ret)
>>>>            return (ret);
>>>>
>>>> +   if (flags & BUS_DMA_64BIT) {
>>>> +           map->_dm_cookie = is;
>>>> +           *dmamap = map;
>>>> +           return (0);
>>>> +   }
>>>> +
>>>>    ims = iommu_iomap_create(atop(round_page(size)));
>>>>
>>>>    if (ims == NULL) {
>>>> @@ -641,8 +652,10 @@ iommu_dvmamap_destroy(bus_dma_tag_t t, b
>>>>    if (map->dm_nsegs)
>>>>            bus_dmamap_unload(t0, map);
>>>>
>>>> -        if (map->_dm_cookie)
>>>> -                iommu_iomap_destroy(map->_dm_cookie);
>>>> +   if (!ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
>>>> +           if (map->_dm_cookie)
>>>> +                   iommu_iomap_destroy(map->_dm_cookie);
>>>> +   }
>>>>    map->_dm_cookie = NULL;
>>>>
>>>>    BUS_DMA_FIND_PARENT(t, _dmamap_destroy);
>>>> @@ -667,36 +680,36 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_
>>>>    u_long dvmaddr, sgstart, sgend;
>>>>    bus_size_t align, boundary;
>>>>    struct iommu_state *is;
>>>> -   struct iommu_map_state *ims = map->_dm_cookie;
>>>> +   struct iommu_map_state *ims;
>>>>    pmap_t pmap;
>>>>
>>>> -#ifdef DIAGNOSTIC
>>>> -   if (ims == NULL)
>>>> -           panic("iommu_dvmamap_load: null map state");
>>>> -#endif
>>>> -#ifdef DEBUG
>>>> -   if (ims->ims_sb == NULL)
>>>> -           panic("iommu_dvmamap_load: null sb");
>>>> -   if (ims->ims_sb->sb_iommu == NULL)
>>>> -           panic("iommu_dvmamap_load: null iommu");
>>>> -#endif /* DEBUG */
>>>> -   is = ims->ims_sb->sb_iommu;
>>>> -
>>>> -   if (map->dm_nsegs) {
>>>> -           /*
>>>> -            * Is it still in use? _bus_dmamap_load should have taken
>> care
>>>> -            * of this.
>>>> -            */
>>>> -#ifdef DIAGNOSTIC
>>>> -           panic("iommu_dvmamap_load: map still in use");
>>>> -#endif
>>>> -           bus_dmamap_unload(t0, map);
>>>> -   }
>>>> -
>>>>    /*
>>>>     * Make sure that on error condition we return "no valid mappings".
>>>>     */
>>>> -   map->dm_nsegs = 0;
>>>> +   KASSERTMSG(map->dm_nsegs == 0, "map still in use");
>>>> +
>>>> +   if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
>>>> +           unsigned long bypass;
>>>> +           int i;
>>>> +
>>>> +           is = map->_dm_cookie;
>>>> +           bypass = is->is_hw->ihw_bypass;
>>>> +
>>>> +           /* Bypass translation by the IOMMU. */
>>>> +
>>>> +           BUS_DMA_FIND_PARENT(t, _dmamap_load);
>>>> +           err = (*t->_dmamap_load)(t, t0, map, buf, buflen, p,
>> flags);
>>>> +           if (err != 0)
>>>> +                   return (err);
>>>> +
>>>> +           for (i = 0; i < map->dm_nsegs; i++)
>>>> +                   map->dm_segs[i].ds_addr |= bypass;
>>>> +
>>>> +           return (0);
>>>> +   }
>>>> +
>>>> +   ims = map->_dm_cookie;
>>>> +   is = ims->ims_sb->sb_iommu;
>>>>
>>>>    if (buflen < 1 || buflen > map->_dm_size) {
>>>>            DPRINTF(IDB_BUSDMA,
>>>> @@ -876,28 +889,31 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t,
>>>>    bus_size_t boundary, align;
>>>>    u_long dvmaddr, sgstart, sgend;
>>>>    struct iommu_state *is;
>>>> -   struct iommu_map_state *ims = map->_dm_cookie;
>>>> +   struct iommu_map_state *ims;
>>>>
>>>> -#ifdef DIAGNOSTIC
>>>> -   if (ims == NULL)
>>>> -           panic("iommu_dvmamap_load_raw: null map state");
>>>> -#endif
>>>> -#ifdef DEBUG
>>>> -   if (ims->ims_sb == NULL)
>>>> -           panic("iommu_dvmamap_load_raw: null sb");
>>>> -   if (ims->ims_sb->sb_iommu == NULL)
>>>> -           panic("iommu_dvmamap_load_raw: null iommu");
>>>> -#endif /* DEBUG */
>>>> -   is = ims->ims_sb->sb_iommu;
>>>> +   KASSERTMSG(map->dm_nsegs == 0, "map stil in use");
>>>>
>>>> -   if (map->dm_nsegs) {
>>>> -           /* Already in use?? */
>>>> -#ifdef DIAGNOSTIC
>>>> -           panic("iommu_dvmamap_load_raw: map still in use");
>>>> -#endif
>>>> -           bus_dmamap_unload(t0, map);
>>>> +   if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
>>>> +           unsigned long bypass;
>>>> +
>>>> +           is = map->_dm_cookie;
>>>> +           bypass = is->is_hw->ihw_bypass;
>>>> +
>>>> +           /* Bypass translation by the IOMMU. */
>>>> +           for (i = 0; i < nsegs; i++) {
>>>> +                   map->dm_segs[i].ds_addr = bypass | segs[i].ds_addr;
>>>> +                   map->dm_segs[i].ds_len = segs[i].ds_len;
>>>> +           }
>>>> +
>>>> +           map->dm_nsegs = nsegs;
>>>> +           map->dm_mapsize = size;
>>>> +
>>>> +           return (0);
>>>>    }
>>>>
>>>> +   ims = map->_dm_cookie;
>>>> +   is = ims->ims_sb->sb_iommu;
>>>> +
>>>>    /*
>>>>     * A boundary presented to bus_dmamem_alloc() takes precedence
>>>>     * over boundary in the map.
>>>> @@ -1088,11 +1104,6 @@ iommu_dvmamap_append_range(bus_dma_tag_t
>>>>    bus_dma_segment_t *seg = NULL;
>>>>    int i = map->dm_nsegs;
>>>>
>>>> -#ifdef DEBUG
>>>> -   if (ims == NULL)
>>>> -           panic("iommu_dvmamap_append_range: null map state");
>>>> -#endif
>>>> -
>>>>    sgstart = iommu_iomap_translate(ims, pa);
>>>>    sgend = sgstart + length - 1;
>>>>
>>>> @@ -1298,20 +1309,17 @@ void
>>>> iommu_dvmamap_unload(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t
>> map)
>>>> {
>>>>    struct iommu_state *is;
>>>> -   struct iommu_map_state *ims = map->_dm_cookie;
>>>> +   struct iommu_map_state *ims;
>>>>    bus_addr_t dvmaddr = map->_dm_dvmastart;
>>>>    bus_size_t sgsize = map->_dm_dvmasize;
>>>>    int error;
>>>>
>>>> -#ifdef DEBUG
>>>> -   if (ims == NULL)
>>>> -           panic("iommu_dvmamap_unload: null map state");
>>>> -   if (ims->ims_sb == NULL)
>>>> -           panic("iommu_dvmamap_unload: null sb");
>>>> -   if (ims->ims_sb->sb_iommu == NULL)
>>>> -           panic("iommu_dvmamap_unload: null iommu");
>>>> -#endif /* DEBUG */
>>>> +   if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
>>>> +           bus_dmamap_unload(t->_parent, map);
>>>> +           return;
>>>> +   }
>>>>
>>>> +   ims = map->_dm_cookie;
>>>>    is = ims->ims_sb->sb_iommu;
>>>>
>>>>    /* Flush the iommu */
>>>> @@ -1488,7 +1496,7 @@ iommu_dvmamap_print_map(bus_dma_tag_t t,
>>>>            break;
>>>>    }
>>>>
>>>> -   if (map->_dm_cookie) {
>>>> +   if (!ISSET(map->_dm_flags, BUS_DMA_64BIT) && map->_dm_cookie !=
>> NULL) {
>>>>            struct iommu_map_state *ims = map->_dm_cookie;
>>>>            struct iommu_page_map *ipm = &ims->ims_map;
>>>>
>>>> @@ -1546,19 +1554,19 @@ void
>>>> iommu_dvmamap_sync(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t
>> map,
>>>>     bus_addr_t offset, bus_size_t len, int ops)
>>>> {
>>>> -   struct iommu_map_state *ims = map->_dm_cookie;
>>>> +   struct iommu_map_state *ims;
>>>>
>>>> -#ifdef DIAGNOSTIC
>>>> -   if (ims == NULL)
>>>> -           panic("iommu_dvmamap_sync: null map state");
>>>> -   if (ims->ims_sb == NULL)
>>>> -           panic("iommu_dvmamap_sync: null sb");
>>>> -   if (ims->ims_sb->sb_iommu == NULL)
>>>> -           panic("iommu_dvmamap_sync: null iommu");
>>>> -#endif
>>>>    if (len == 0)
>>>>            return;
>>>>
>>>> +   if (map->_dm_flags & BUS_DMA_64BIT) {
>>>> +           if (ops & (BUS_DMASYNC_PREWRITE | BUS_DMASYNC_POSTREAD))
>>>> +                   membar(MemIssue);
>>>> +           return;
>>>> +   }
>>>> +
>>>> +   ims = map->_dm_cookie;
>>>> +
>>>>    if (ops & BUS_DMASYNC_PREWRITE)
>>>>            membar(MemIssue);
>>>>
>>>> @@ -1622,9 +1630,13 @@ iommu_dvmamem_alloc(bus_dma_tag_t t, bus
>>>>        "bound %llx segp %p flags %d\n", (unsigned long long)size,
>>>>        (unsigned long long)alignment, (unsigned long long)boundary,
>>>>        segs, flags));
>>>> +
>>>> +   if ((flags & BUS_DMA_64BIT) == 0)
>>>> +           flags |= BUS_DMA_DVMA;
>>>> +
>>>>    BUS_DMA_FIND_PARENT(t, _dmamem_alloc);
>>>>    return ((*t->_dmamem_alloc)(t, t0, size, alignment, boundary,
>>>> -       segs, nsegs, rsegs, flags | BUS_DMA_DVMA));
>>>> +       segs, nsegs, rsegs, flags));
>>>> }
>>>>
>>>> void
>>>> @@ -1763,7 +1775,7 @@ iommu_iomap_load_map(struct iommu_state
>>>>
>>>>            /* Flush cache if necessary. */
>>>>            slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
>>>> -           if (is->is_flags & IOMMU_FLUSH_CACHE &&
>>>> +           if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
>>>>                (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
>>>>                    IOMMUREG_WRITE(is, iommu_cache_flush,
>>>>                        is->is_ptsb + slot * 8);
>>>> @@ -1788,7 +1800,7 @@ iommu_iomap_unload_map(struct iommu_stat
>>>>
>>>>            /* Flush cache if necessary. */
>>>>            slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
>>>> -           if (is->is_flags & IOMMU_FLUSH_CACHE &&
>>>> +           if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
>>>>                (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
>>>>                    IOMMUREG_WRITE(is, iommu_cache_flush,
>>>>                        is->is_ptsb + slot * 8);
>>>> Index: dev/iommureg.h
>>>> ===================================================================
>>>> RCS file: /cvs/src/sys/arch/sparc64/dev/iommureg.h,v
>>>> retrieving revision 1.17
>>>> diff -u -p -r1.17 iommureg.h
>>>> --- dev/iommureg.h  17 Aug 2012 20:46:50 -0000      1.17
>>>> +++ dev/iommureg.h  10 May 2017 12:00:09 -0000
>>>> @@ -90,10 +90,11 @@ struct iommu_strbuf {
>>>> #define IOMMUCR_DE         0x000000000000000002LL  /* Diag enable */
>>>> #define IOMMUCR_EN         0x000000000000000001LL  /* Enable IOMMU */
>>>>
>>>> -#define IOMMUCR_FIRE_SE            0x000000000000000400LL  /* Snoop
>> enable */
>>>> -#define IOMMUCR_FIRE_CM_EN 0x000000000000000300LL  /* Cache mode
>> enable */
>>>> -#define IOMMUCR_FIRE_BE            0x000000000000000002LL  /* Bypass
>> enable */
>>>> -#define IOMMUCR_FIRE_TE            0x000000000000000001LL  /*
>> Translation enabled */
>>>> +#define IOMMUCR_FIRE_PD            0x000000000000001000UL  /* Process
>> disable */
>>>> +#define IOMMUCR_FIRE_SE            0x000000000000000400UL  /* Snoop
>> enable */
>>>> +#define IOMMUCR_FIRE_CM_EN 0x000000000000000300UL  /* Cache mode
>> enable */
>>>> +#define IOMMUCR_FIRE_BE            0x000000000000000002UL  /* Bypass
>> enable */
>>>> +#define IOMMUCR_FIRE_TE            0x000000000000000001UL  /*
>> Translation enabled */
>>>>
>>>> /*
>>>>  * IOMMU stuff
>>>> Index: dev/iommuvar.h
>>>> ===================================================================
>>>> RCS file: /cvs/src/sys/arch/sparc64/dev/iommuvar.h,v
>>>> retrieving revision 1.17
>>>> diff -u -p -r1.17 iommuvar.h
>>>> --- dev/iommuvar.h  4 May 2016 18:26:12 -0000       1.17
>>>> +++ dev/iommuvar.h  10 May 2017 12:00:09 -0000
>>>> @@ -100,6 +100,21 @@ struct iommu_map_state {
>>>> };
>>>> #define IOMMU_MAP_STREAM   1
>>>>
>>>> +struct iommu_hw {
>>>> +   void                    (*ihw_enable)(struct iommu_state *);
>>>> +
>>>> +   unsigned long           ihw_dvma_pa;
>>>> +
>>>> +   unsigned long           ihw_bypass;
>>>> +   unsigned long           ihw_bypass_nc;          /* non-cached */
>>>> +   unsigned long           ihw_bypass_ro;          /* relaxed
>> ordering */
>>>> +
>>>> +   unsigned int            ihw_flags;
>>>> +#define IOMMU_HW_FLUSH_CACHE               (1 << 0)
>>>> +};
>>>> +
>>>> +extern const struct iommu_hw iommu_hw_default;
>>>> +
>>>> /*
>>>>  * per-IOMMU state
>>>>  */
>>>> @@ -112,8 +127,7 @@ struct iommu_state {
>>>>    int64_t                 is_cr;          /* Control register value
>> */
>>>>    struct mutex            is_mtx;
>>>>    struct extent           *is_dvmamap;    /* DVMA map for this
>> instance */
>>>> -   int                     is_flags;
>>>> -#define IOMMU_FLUSH_CACHE  0x00000001
>>>> +   const struct iommu_hw   *is_hw;
>>>>
>>>>    struct strbuf_ctl       *is_sb[2];      /* Streaming buffers if
>> any */
>>>>
>>>> @@ -126,7 +140,8 @@ struct iommu_state {
>>>> };
>>>>
>>>> /* interfaces for PCI/SBus code */
>>>> -void       iommu_init(char *, struct iommu_state *, int, u_int32_t);
>>>> +void       iommu_init(char *, const struct iommu_hw *, struct
>> iommu_state *,
>>>> +    int, u_int32_t);
>>>> void       iommu_reset(struct iommu_state *);
>>>> paddr_t iommu_extract(struct iommu_state *, bus_addr_t);
>>>> int64_t iommu_lookup_tte(struct iommu_state *, bus_addr_t);
>>>> @@ -146,6 +161,7 @@ int     iommu_dvmamem_alloc(bus_dma_tag_t, b
>>>>        bus_size_t, bus_size_t, bus_dma_segment_t *, int, int *, int);
>>>> void       iommu_dvmamem_free(bus_dma_tag_t, bus_dma_tag_t,
>> bus_dma_segment_t *,
>>>>        int);
>>>> +
>>>>
>>>> #define IOMMUREG_READ(is, reg)                             \
>>>>    bus_space_read_8((is)->is_bustag,               \
>>>> Index: dev/pci_machdep.c
>>>> ===================================================================
>>>> RCS file: /cvs/src/sys/arch/sparc64/dev/pci_machdep.c,v
>>>> retrieving revision 1.44
>>>> diff -u -p -r1.44 pci_machdep.c
>>>> --- dev/pci_machdep.c       10 May 2014 12:15:19 -0000      1.44
>>>> +++ dev/pci_machdep.c       10 May 2017 12:00:09 -0000
>>>> @@ -58,6 +58,7 @@ int sparc_pci_debug = 0x0;
>>>> #include <machine/openfirm.h>
>>>> #include <dev/pci/pcivar.h>
>>>> #include <dev/pci/pcireg.h>
>>>> +#include <dev/pci/pcidevs.h>
>>>>
>>>> #include <dev/ofw/ofw_pci.h>
>>>>
>>>> @@ -85,6 +86,46 @@ pci_attach_hook(parent, self, pba)
>>>>    struct pcibus_attach_args *pba;
>>>> {
>>>>    /* Don't do anything */
>>>> +}
>>>> +
>>>> +int
>>>> +pci_bcm_dmamap_create(bus_dma_tag_t dt, bus_dma_tag_t t0, bus_size_t
>> size,
>>>> +    int nsegments, bus_size_t maxsegsz, bus_size_t boundary, int
>> flags,
>>>> +    bus_dmamap_t *dmamp)
>>>> +{
>>>> +   bus_dma_tag_t pdt = dt->_parent;
>>>> +
>>>> +   CLR(flags, BUS_DMA_64BIT);
>>>> +
>>>> +   return ((*pdt->_dmamap_create)(pdt, t0, size, nsegments, maxsegsz,
>>>> +       boundary, flags, dmamp));
>>>> +}
>>>> +
>>>> +int
>>>> +pci_probe_device_hook(pci_chipset_tag_t pc, struct pci_attach_args
>> *pa)
>>>> +{
>>>> +   bus_dma_tag_t dt, pdt;
>>>> +
>>>> +   if (pa->pa_id ==
>>>> +       PCI_ID_CODE(PCI_VENDOR_RCC, PCI_PRODUCT_RCC_PCIE_PCIX)) {
>>>> +           /*
>>>> +            * These PCI bridges only support 40bit DVA, so intercept
>>>> +            * bus_dmamap_create so we can clear BUS_DMA_64BIT.
>>>> +            */
>>>> +
>>>> +           dt = malloc(sizeof(*dt), M_DEVBUF, M_NOWAIT | M_ZERO);
>>>> +           if (dt == NULL)
>>>> +                   panic("%s: could not alloc dma tag", __func__);
>>>> +
>>>> +           pdt = pa->pa_dmat;
>>>> +
>>>> +           dt->_parent = pdt;
>>>> +           dt->_dmamap_create = pci_bcm_dmamap_create;
>>>> +
>>>> +           pa->pa_dmat = dt;
>>>> +   }
>>>> +
>>>> +   return (0);
>>>> }
>>>>
>>>> int
>>>> Index: dev/psycho.c
>>>> ===================================================================
>>>> RCS file: /cvs/src/sys/arch/sparc64/dev/psycho.c,v
>>>> retrieving revision 1.74
>>>> diff -u -p -r1.74 psycho.c
>>>> --- dev/psycho.c    23 Aug 2016 03:28:01 -0000      1.74
>>>> +++ dev/psycho.c    10 May 2017 12:00:09 -0000
>>>> @@ -902,7 +902,7 @@ psycho_iommu_init(struct psycho_softc *s
>>>>            panic("couldn't malloc iommu name");
>>>>    snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
>>>>
>>>> -   iommu_init(name, is, tsbsize, iobase);
>>>> +   iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
>>>> }
>>>>
>>>> /*
>>>> Index: dev/pyro.c
>>>> ===================================================================
>>>> RCS file: /cvs/src/sys/arch/sparc64/dev/pyro.c,v
>>>> retrieving revision 1.30
>>>> diff -u -p -r1.30 pyro.c
>>>> --- dev/pyro.c      20 Dec 2016 13:40:50 -0000      1.30
>>>> +++ dev/pyro.c      10 May 2017 12:00:09 -0000
>>>> @@ -131,6 +131,30 @@ int pyro_msi_eq_intr(void *);
>>>> int pyro_dmamap_create(bus_dma_tag_t, bus_dma_tag_t, bus_size_t, int,
>>>>     bus_size_t, bus_size_t, int, bus_dmamap_t *);
>>>>
>>>> +void pyro_iommu_enable(struct iommu_state *);
>>>> +
>>>> +const struct iommu_hw iommu_hw_fire = {
>>>> +   .ihw_enable     = pyro_iommu_enable,
>>>> +
>>>> +   .ihw_dvma_pa    = 0x000007ffffffffffUL,
>>>> +
>>>> +   .ihw_bypass     = 0xfffc000000000000UL,
>>>> +   .ihw_bypass_nc  = 0x0000080000000000UL,
>>>> +   .ihw_bypass_ro  = 0,
>>>> +};
>>>> +
>>>> +const struct iommu_hw iommu_hw_oberon = {
>>>> +   .ihw_enable     = pyro_iommu_enable,
>>>> +
>>>> +   .ihw_dvma_pa    = 0x00007fffffffffffUL,
>>>> +
>>>> +   .ihw_bypass     = 0x7ffc000000000000UL,
>>>> +   .ihw_bypass_nc  = 0x0000800000000000UL,
>>>> +   .ihw_bypass_ro  = 0x8000000000000000UL,
>>>> +
>>>> +   .ihw_flags      = IOMMU_HW_FLUSH_CACHE,
>>>> +};
>>>> +
>>>> #ifdef DDB
>>>> void pyro_xir(void *, int);
>>>> #endif
>>>> @@ -266,6 +290,7 @@ pyro_init_iommu(struct pyro_softc *sc, s
>>>>    int tsbsize = 7;
>>>>    u_int32_t iobase = -1;
>>>>    char *name;
>>>> +   const struct iommu_hw *ihw = &iommu_hw_fire;
>>>>
>>>>    is->is_bustag = sc->sc_bust;
>>>>
>>>> @@ -282,11 +307,23 @@ pyro_init_iommu(struct pyro_softc *sc, s
>>>>            panic("couldn't malloc iommu name");
>>>>    snprintf(name, 32, "%s dvma", sc->sc_dv.dv_xname);
>>>>
>>>> -   /* On Oberon, we need to flush the cache. */
>>>>    if (sc->sc_oberon)
>>>> -           is->is_flags |= IOMMU_FLUSH_CACHE;
>>>> +           ihw = &iommu_hw_oberon;
>>>> +
>>>> +   iommu_init(name, ihw, is, tsbsize, iobase);
>>>> +}
>>>> +
>>>> +void
>>>> +pyro_iommu_enable(struct iommu_state *is)
>>>> +{
>>>> +   unsigned long cr;
>>>> +
>>>> +   cr = IOMMUREG_READ(is, iommu_cr);
>>>> +   cr |= IOMMUCR_FIRE_BE | IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
>>>> +       IOMMUCR_FIRE_TE;
>>>>
>>>> -   iommu_init(name, is, tsbsize, iobase);
>>>> +   IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb | is->is_tsbsize);
>>>> +   IOMMUREG_WRITE(is, iommu_cr, cr);
>>>> }
>>>>
>>>> void
>>>> Index: dev/sbus.c
>>>> ===================================================================
>>>> RCS file: /cvs/src/sys/arch/sparc64/dev/sbus.c,v
>>>> retrieving revision 1.44
>>>> diff -u -p -r1.44 sbus.c
>>>> --- dev/sbus.c      19 Sep 2015 21:07:04 -0000      1.44
>>>> +++ dev/sbus.c      10 May 2017 12:00:09 -0000
>>>> @@ -349,7 +349,7 @@ sbus_mb_attach(struct device *parent, st
>>>>    snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
>>>>
>>>>    printf("%s: ", sc->sc_dev.dv_xname);
>>>> -   iommu_init(name, &sc->sc_is, 0, -1);
>>>> +   iommu_init(name, &iommu_hw_default, &sc->sc_is, 0, -1);
>>>>
>>>>    /* Initialize Starfire PC interrupt translation. */
>>>>    if (OF_getprop(findroot(), "name", buf, sizeof(buf)) > 0 &&
>>>> Index: dev/schizo.c
>>>> ===================================================================
>>>> RCS file: /cvs/src/sys/arch/sparc64/dev/schizo.c,v
>>>> retrieving revision 1.67
>>>> diff -u -p -r1.67 schizo.c
>>>> --- dev/schizo.c    23 Aug 2016 03:28:01 -0000      1.67
>>>> +++ dev/schizo.c    10 May 2017 12:00:09 -0000
>>>> @@ -451,7 +451,7 @@ schizo_init_iommu(struct schizo_softc *s
>>>>                "using iobase=0x%x, tsbsize=%d\n", iobase, tsbsize));
>>>>    }
>>>>
>>>> -   iommu_init(name, is, tsbsize, iobase);
>>>> +   iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
>>>> }
>>>>
>>>> int
>>>> Index: include/pci_machdep.h
>>>> ===================================================================
>>>> RCS file: /cvs/src/sys/arch/sparc64/include/pci_machdep.h,v
>>>> retrieving revision 1.33
>>>> diff -u -p -r1.33 pci_machdep.h
>>>> --- include/pci_machdep.h   4 May 2016 14:30:01 -0000       1.33
>>>> +++ include/pci_machdep.h   10 May 2017 12:00:09 -0000
>>>> @@ -74,10 +74,13 @@ struct sparc_pci_chipset {
>>>>    pcireg_t (*conf_read)(pci_chipset_tag_t, pcitag_t, int);
>>>>    void (*conf_write)(pci_chipset_tag_t, pcitag_t, int, pcireg_t);
>>>>    int (*intr_map)(struct pci_attach_args *, pci_intr_handle_t *);
>>>> +   int (*probe_device_hook)(void *, struct pci_attach_args *);
>>>> };
>>>>
>>>> void               pci_attach_hook(struct device *, struct device *,
>>>>                                 struct pcibus_attach_args *);
>>>> +int                pci_probe_device_hook(pci_chipset_tag_t,
>>>> +               struct pci_attach_args *);
>>>> int                pci_bus_maxdevs(pci_chipset_tag_t, int);
>>>> pcitag_t   pci_make_tag(pci_chipset_tag_t, int, int, int);
>>>> void               pci_decompose_tag(pci_chipset_tag_t, pcitag_t, int
>> *, int *,
>>>> @@ -102,8 +105,6 @@ int             sparc64_pci_enumerate_bus(struct pc
>>>>                struct pci_attach_args *);
>>>>
>>>> #define PCI_MACHDEP_ENUMERATE_BUS sparc64_pci_enumerate_bus
>>>> -
>>>> -#define    pci_probe_device_hook(c, a)     (0)
>>>>
>>>> #define    pci_min_powerstate(c, t)        (PCI_PMCSR_STATE_D3)
>>>> #define    pci_set_powerstate_md(c, t, s, p)
>>>
>>>
>>
>>

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Joseph Mayer
In reply to this post by Mark Kettenis
‐‐‐‐‐‐‐ Original Message ‐‐‐‐‐‐‐
On Friday, October 19, 2018 5:15 PM, Mark Kettenis <[hidden email]> wrote:

> > Date: Fri, 19 Oct 2018 10:22:30 +1000
> > From: David Gwynne [hidden email]
> > On Wed, May 10, 2017 at 10:09:59PM +1000, David Gwynne wrote:
> >
> > > On Mon, May 08, 2017 at 11:03:58AM +1000, David Gwynne wrote:
> > >
> > > > on modern sparc64s (think fire or sparc enterprise Mx000 boxes),
> > > > setting up and tearing down the translation table entries (TTEs)
> > > > is very expensive. so expensive that the cost of doing it for disk
> > > > io has a noticable impact on compile times.
> > > > now that there's a BUS_DMA_64BIT flag, we can use that to decide
> > > > to bypass the iommu for devices that set that flag, therefore
> > > > avoiding the cost of handling the TTEs.

Question for the unintroduced, what's the scope here, TTE is Sparc's
page table and reconfiguring them at (process) context switch is
expensive and this suggestion removes the need for TTE:s for hardware
device access, but those don't change at context switch?

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

David Gwynne-5


> On 20 Oct 2018, at 11:56 am, Joseph Mayer <[hidden email]> wrote:
>
> ‐‐‐‐‐‐‐ Original Message ‐‐‐‐‐‐‐
> On Friday, October 19, 2018 5:15 PM, Mark Kettenis <[hidden email]> wrote:
>
>>> Date: Fri, 19 Oct 2018 10:22:30 +1000
>>> From: David Gwynne [hidden email]
>>> On Wed, May 10, 2017 at 10:09:59PM +1000, David Gwynne wrote:
>>>
>>>> On Mon, May 08, 2017 at 11:03:58AM +1000, David Gwynne wrote:
>>>>
>>>>> on modern sparc64s (think fire or sparc enterprise Mx000 boxes),
>>>>> setting up and tearing down the translation table entries (TTEs)
>>>>> is very expensive. so expensive that the cost of doing it for disk
>>>>> io has a noticable impact on compile times.
>>>>> now that there's a BUS_DMA_64BIT flag, we can use that to decide
>>>>> to bypass the iommu for devices that set that flag, therefore
>>>>> avoiding the cost of handling the TTEs.
>
> Question for the unintroduced, what's the scope here, TTE is Sparc's
> page table and reconfiguring them at (process) context switch is
> expensive and this suggestion removes the need for TTE:s for hardware
> device access, but those don't change at context switch?

We're talking about an IOMMU here, not a traditional MMU providing virtual addresses for programs. An IOMMU sits between physical memory and the devices in a machine. It allows DMA addresses to be mapped to different parts of physical memory. Mapping physical memory to a DMA virtual address (or DVA) is how a device that only understands 32bit addresses can work in a 64bit machine. Memory at high addresses gets mapped to a low DVA.

This is done at runtime on OpenBSD when DMA mappings are loaded or unloaded by populating Translation Table Entries (TTEs). The TTEs effectively form a table or array mapping DVA pages to physical addresses. Generally device drivers load and unload DMA memory for every I/O or packet and so on.

IOMMUs in sparc64s have some more features than this. Because they really are between memory and the devices they can act as a gatekeeper for all memory accesses. They also have a toggle that can allow a device to have direct or passthru access to physical memory. If passthru is enabled, there's a special address range that effectively maps all physical memory into a DVA range. Devices can be pointed at it without having to manage TTEs. When passthru is disabled, all accesses must go through TTEs.

Currently OpenBSD disables passthru. The benefit is devices can't blindly access sensitive memory unless it is explicitly shared. Note that this is how it is on most architectures anyway. However, the consequence of managing the TTEs is that it is expensive, and extremely so in some cases.

dlg

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Joseph Mayer
On Saturday, October 20, 2018 10:14 AM, David Gwynne <[hidden email]> wrote:

> > On 20 Oct 2018, at 11:56 am, Joseph Mayer [hidden email] wrote:
> > ‐‐‐‐‐‐‐ Original Message ‐‐‐‐‐‐‐
> > On Friday, October 19, 2018 5:15 PM, Mark Kettenis [hidden email] wrote:
> >
> > > > Date: Fri, 19 Oct 2018 10:22:30 +1000
> > > > From: David Gwynne [hidden email]
> > > > On Wed, May 10, 2017 at 10:09:59PM +1000, David Gwynne wrote:
> > > >
> > > > > On Mon, May 08, 2017 at 11:03:58AM +1000, David Gwynne wrote:
> > > > >
> > > > > > on modern sparc64s (think fire or sparc enterprise Mx000 boxes),
> > > > > > setting up and tearing down the translation table entries (TTEs)
> > > > > > is very expensive. so expensive that the cost of doing it for disk
> > > > > > io has a noticable impact on compile times.
> > > > > > now that there's a BUS_DMA_64BIT flag, we can use that to decide
> > > > > > to bypass the iommu for devices that set that flag, therefore
> > > > > > avoiding the cost of handling the TTEs.
> >
> > Question for the unintroduced, what's the scope here, TTE is Sparc's
> > page table and reconfiguring them at (process) context switch is
> > expensive and this suggestion removes the need for TTE:s for hardware
> > device access, but those don't change at context switch?
>
> We're talking about an IOMMU here, not a traditional MMU providing virtual addresses for programs. An IOMMU sits between physical memory and the devices in a machine. It allows DMA addresses to mapped to different parts of physical memory. Mapping physical memory to a DMA virtual address (or dva) is how a device that only understands 32bit addresses can work in a 64bit machine. Memory at high addresses gets mapped to a low dva.
>
> This is done at runtime on OpenBSD when DMA mappings are loaded or unloaded by populating Translation Table Entries (TTEs). A TTE is effectively a table or array mapping DVA pages to physical addresses. Generally device drivers load and unload dma memory for every I/O or packet or so on.
>
> IOMMUs in sparc64s have some more features than this. Because they really are between memory and the devices they can act as a gatekeeper for all memory accesses. They also have a toggle that can allow a device to have direct or passthru access to physical memory. If passthru is enabled, there's a special address range that effectively maps all physical memory into a DVA range. Devices can be pointed at it without having to manage TTEs. When passthru is disabled, all accesses must go through TTEs.
>
> Currently OpenBSD disables passthru. The benefit is devices can't blindly access sensitive memory unless it is explicitly shared. Note that this is how it is on most architectures anyway. However, the consequence of managing the TTEs is that it is expensive, and extremely so in some cases.
>
> dlg

Last iteration from me on this one.

Why is this not a problem on some other architectures?

I'd have thought DMA and hardware being assigned transitory addresses
(from memory allocator or other OS subsystem or driver) mostly is a
lower level phenomenon and memcpy normally applies on higher levels,
isn't it so - for networking for instance, mbuf's take over soon above
the driver level. Does OpenBSD have a pool of to-be-mbufs and it asks
network drivers to write received ethernet frames directly to them, and
similarly transmit ethernet frames directly from mbuf:s?

What potentially or clearly sensitive memory would passthru expose,
driver-owned structures only or all memory?

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

David Gwynne-5
On Sat, Oct 20, 2018 at 02:44:29AM +0000, Joseph Mayer wrote:
>
> Last iteration from me on this one.
>
> Why is this not a problem on some other architectures?

It is a problem, it's just that other archs don't have an iommu like
sparc64.

> I'd have thought DMA and hardware being assigned transitory addresses
> (from memory allocator or other OS subsystem or driver) mostly is a
> lower level phenomenon and memcpy normally applies on higher levels,
> isn't it so - for networking for instance, mbuf's take over soon above
> the driver level. Does OpenBSD have a pool of to-be-mbufs and it asks
> network drivers to write received ethernet frames directly to them, and
> similarly transmit ethernet frames directly from mbuf:s?

Hrm. There are three views of memory you need to keep in mind here.
Memory has a physical address which gets mapped to virtual addresses
that the kernel and programs see. Finally, there's the DMA address,
which is the address devices use to access physical memory.

On most archs the physical and dma addresses are the same thing. On
archs with an IOMMU or similar, the dma address can be virtual, just
like the kernel addresses are virtual.

When you allocate an mbuf, you're getting a chunk of physical memory
that is mapped into the kernel virtual address space. For a device
to do something with it, the kernel has the bus_dma api that figures
out the dma address of the physical memory behind the kernel virtual
address.

On sparc64, that figuring out involves finding the physical address of
the memory, then allocating and filling TTEs. On amd64, it just has to
get the physical address of the kva and the device can use it directly.

> What potentially or clearly sensitive memory would passthru expose,
> driver-owned structures only or all memory?

Passthru means a device can access all the physical memory in a
computer. So everything.

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Andrew Grillet
In reply to this post by David Gwynne-5
These days we are not so short of memory - would it not be possible to
allocate an mbuf (or two for double-buffered) for each file
when opened, and free when closed?

I can see the management might be more complex, but the performance
benefits might be considerable.
Also, for VM disk access (ldom on Txxxx) does this mean the process happens
twice - once for disk-to-host
and again for host-to-guest? In which case, allocating mbufs for the entire
vdisk file to the host once
at (VM) boot time (ldomctl start guest), and deallocating when it is shut
down would save huge amounts
of work. Unless the host is not involved in guest file access at all (don't
know how you could safely do
that).

I can't see that making all of memory visible (even to kernel processes) in a
guest is acceptable. Too much to
go wrong.

Andrew

On Sat, 20 Oct 2018 at 01:59, David Gwynne <[hidden email]> wrote:

>
>
> > On 19 Oct 2018, at 9:59 pm, Andrew Grillet <[hidden email]> wrote:
> >
> > Is the setup and teardown per transfer or when file is opened and closed?
> > Or is it set up once per context switch of task?
> >
> > I am partly interested cos I would like to improve mt one day (as user of
> > tape
> > and Sparc64 Txxx) if I get the time.
> >
> > Andrew
>
> The overhead is per transfer. You might not get better performance out of
> a tx000 because of the PCIe bridges involved, but you may also be lucky and
> not have that bridge in the way.
>
> >
> >
> >
> > On Fri, 19 Oct 2018 at 10:22, Mark Kettenis <[hidden email]>
> wrote:
> >
> >>> Date: Fri, 19 Oct 2018 10:22:30 +1000
> >>> From: David Gwynne <[hidden email]>
> >>>
> >>> On Wed, May 10, 2017 at 10:09:59PM +1000, David Gwynne wrote:
> >>>> On Mon, May 08, 2017 at 11:03:58AM +1000, David Gwynne wrote:
> >>>>> on modern sparc64s (think fire or sparc enterprise Mx000 boxes),
> >>>>> setting up and tearing down the translation table entries (TTEs)
> >>>>> is very expensive. so expensive that the cost of doing it for disk
> >>>>> io has a noticable impact on compile times.
> >>>>>
> >>>>> now that there's a BUS_DMA_64BIT flag, we can use that to decide
> >>>>> to bypass the iommu for devices that set that flag, therefore
> >>>>> avoiding the cost of handling the TTEs.
> >>>>>
> >>>>> the following diff adds support for bypass mappings to the iommu
> >>>>> code on sparc64. it's based on a diff from kettenis@ back in 2009.
> >>>>> the main changes are around coping with the differences between
> >>>>> schizo/psycho and fire/oberon.
> >>>>>
> >>>>> the differences between the chips are now represented by a iommu_hw
> >>>>> struct. these differences include how to enable the iommu (now via
> >>>>> a function pointer), and masks for bypass addresses.
> >>>>>
> >>>>> ive tested this on oberon (on an m4000) and schizo (on a v880).
> >>>>> however, the bypass code isnt working on fire (v245s). to cope with
> >>>>> that for now, the iommu_hw struct lets drivers mask flag bits that
> >>>>> are handled when creating a dmamap. this means fire boards will
> >>>>> ignore BUS_DMA_64BIT until i can figure out whats wrong with them.
> >>>>
> >>>> i figured it out. it turns out Fire was working fine. however,
> >>>> enabling 64bit dva on the onboard devices didnt work because the
> >>>> serverworks/broadcom pcie to pcix bridge can only handle dma addresses
> >>>> in the low 40 bits. because the fire bypass window is higher than
> >>>> this, the bridge would choke and things stopped working.
> >>>>
> >>>> the updated diff attempts to handle this. basically when probing
> >>>> the bridge, the platform creates a custom dma tag for it. this tag
> >>>> intercets bus_dmamap_create and clears the BUS_DMA_64BIT flag before
> >>>> handing it up to the parent bridge, which is pyro in my situation.
> >>>> it looks like early sun4v boxes could make use of this too.
> >>>>
> >>>>> i have not tested this on psycho yet. if anyone has such a machine
> >>>>> and is willing to work with me to figure it out, please talk to me.
> >>>>
> >>>> i still dont have psycho reports.
> >>>
> >>> Would anyone object if I committed this? I've been running it for the
> >>> last release or two without issues, but with significant improvements
> in
> >>> performance on the machines involved.
> >>
> >> At the price of giving all PCI devices unrestricted access to memory.
> >>
> >> So I'm not eager to this, especially since on sun4v hardware bypassing
> >> the iommu isn't possible as soon as multiple domains are enabled.  And
> >> we lose a useful diagnostic when developing drivers.  Are you sure the
> >> iommu overhead can't be reduced some other way?  At some point we
> >> probably want to add iommu support on amd64 and arm64, but if that
> >> comes with a similar overhead as on sparc64 that's going to be a bit
> >> of an issue.
> >>
> >>>> Index: dev/iommu.c
> >>>> ===================================================================
> >>>> RCS file: /cvs/src/sys/arch/sparc64/dev/iommu.c,v
> >>>> retrieving revision 1.74
> >>>> diff -u -p -r1.74 iommu.c
> >>>> --- dev/iommu.c     30 Apr 2017 16:45:45 -0000      1.74
> >>>> +++ dev/iommu.c     10 May 2017 12:00:09 -0000
> >>>> @@ -100,6 +100,25 @@ void iommu_iomap_clear_pages(struct iomm
> >>>> void _iommu_dvmamap_sync(bus_dma_tag_t, bus_dma_tag_t, bus_dmamap_t,
> >>>>     bus_addr_t, bus_size_t, int);
> >>>>
> >>>> +void iommu_hw_enable(struct iommu_state *);
> >>>> +
> >>>> +const struct iommu_hw iommu_hw_default = {
> >>>> +   .ihw_enable     = iommu_hw_enable,
> >>>> +
> >>>> +   .ihw_dvma_pa    = IOTTE_PAMASK,
> >>>> +
> >>>> +   .ihw_bypass     = 0x3fffUL << 50,
> >>>> +   .ihw_bypass_nc  = 0,
> >>>> +   .ihw_bypass_ro  = 0,
> >>>> +};
> >>>> +
> >>>> +void
> >>>> +iommu_hw_enable(struct iommu_state *is)
> >>>> +{
> >>>> +   IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
> >>>> +   IOMMUREG_WRITE(is, iommu_cr, IOMMUCR_EN | (is->is_tsbsize << 16));
> >>>> +}
> >>>> +
> >>>> /*
> >>>>  * Initiate an STC entry flush.
> >>>>  */
> >>>> @@ -125,7 +144,8 @@ iommu_strbuf_flush(struct strbuf_ctl *sb
> >>>>  * - create a private DVMA map.
> >>>>  */
> >>>> void
> >>>> -iommu_init(char *name, struct iommu_state *is, int tsbsize, u_int32_t
> >> iovabase)
> >>>> +iommu_init(char *name, const struct iommu_hw *ihw, struct iommu_state
> >> *is,
> >>>> +    int tsbsize, u_int32_t iovabase)
> >>>> {
> >>>>    psize_t size;
> >>>>    vaddr_t va;
> >>>> @@ -149,13 +169,9 @@ iommu_init(char *name, struct iommu_stat
> >>>>     * be hard-wired, so we read the start and size from the PROM and
> >>>>     * just use those values.
> >>>>     */
> >>>> -   if (strncmp(name, "pyro", 4) == 0) {
> >>>> -           is->is_cr = IOMMUREG_READ(is, iommu_cr);
> >>>> -           is->is_cr &= ~IOMMUCR_FIRE_BE;
> >>>> -           is->is_cr |= (IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
> >>>> -               IOMMUCR_FIRE_TE);
> >>>> -   } else
> >>>> -           is->is_cr = IOMMUCR_EN;
> >>>> +
> >>>> +   is->is_hw = ihw;
> >>>> +
> >>>>    is->is_tsbsize = tsbsize;
> >>>>    if (iovabase == (u_int32_t)-1) {
> >>>>            is->is_dvmabase = IOTSB_VSTART(is->is_tsbsize);
> >>>> @@ -237,15 +253,6 @@ iommu_init(char *name, struct iommu_stat
> >>>>    mtx_init(&is->is_mtx, IPL_HIGH);
> >>>>
> >>>>    /*
> >>>> -    * Set the TSB size.  The relevant bits were moved to the TSB
> >>>> -    * base register in the PCIe host bridges.
> >>>> -    */
> >>>> -   if (strncmp(name, "pyro", 4) == 0)
> >>>> -           is->is_ptsb |= is->is_tsbsize;
> >>>> -   else
> >>>> -           is->is_cr |= (is->is_tsbsize << 16);
> >>>> -
> >>>> -   /*
> >>>>     * Now actually start up the IOMMU.
> >>>>     */
> >>>>    iommu_reset(is);
> >>>> @@ -262,10 +269,7 @@ iommu_reset(struct iommu_state *is)
> >>>> {
> >>>>    int i;
> >>>>
> >>>> -   IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
> >>>> -
> >>>> -   /* Enable IOMMU */
> >>>> -   IOMMUREG_WRITE(is, iommu_cr, is->is_cr);
> >>>> +   (*is->is_hw->ihw_enable)(is);
> >>>>
> >>>>    for (i = 0; i < 2; ++i) {
> >>>>            struct strbuf_ctl *sb = is->is_sb[i];
> >>>> @@ -280,7 +284,7 @@ iommu_reset(struct iommu_state *is)
> >>>>                    printf(", STC%d enabled", i);
> >>>>    }
> >>>>
> >>>> -   if (is->is_flags & IOMMU_FLUSH_CACHE)
> >>>> +   if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE))
> >>>>            IOMMUREG_WRITE(is, iommu_cache_invalidate, -1ULL);
> >>>> }
> >>>>
> >>>> @@ -433,7 +437,7 @@ iommu_extract(struct iommu_state *is, bu
> >>>>    if (dva >= is->is_dvmabase && dva <= is->is_dvmaend)
> >>>>            tte = is->is_tsb[IOTSBSLOT(dva, is->is_tsbsize)];
> >>>>
> >>>> -   return (tte & IOTTE_PAMASK);
> >>>> +   return (tte & is->is_hw->ihw_dvma_pa);
> >>>> }
> >>>>
> >>>> /*
> >>>> @@ -601,6 +605,7 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
> >>>> {
> >>>>    int ret;
> >>>>    bus_dmamap_t map;
> >>>> +   struct iommu_state *is = sb->sb_iommu;
> >>>>    struct iommu_map_state *ims;
> >>>>
> >>>>    BUS_DMA_FIND_PARENT(t, _dmamap_create);
> >>>> @@ -610,6 +615,12 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
> >>>>    if (ret)
> >>>>            return (ret);
> >>>>
> >>>> +   if (flags & BUS_DMA_64BIT) {
> >>>> +           map->_dm_cookie = is;
> >>>> +           *dmamap = map;
> >>>> +           return (0);
> >>>> +   }
> >>>> +
> >>>>    ims = iommu_iomap_create(atop(round_page(size)));
> >>>>
> >>>>    if (ims == NULL) {
> >>>> @@ -641,8 +652,10 @@ iommu_dvmamap_destroy(bus_dma_tag_t t, b
> >>>>    if (map->dm_nsegs)
> >>>>            bus_dmamap_unload(t0, map);
> >>>>
> >>>> -        if (map->_dm_cookie)
> >>>> -                iommu_iomap_destroy(map->_dm_cookie);
> >>>> +   if (!ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> >>>> +           if (map->_dm_cookie)
> >>>> +                   iommu_iomap_destroy(map->_dm_cookie);
> >>>> +   }
> >>>>    map->_dm_cookie = NULL;
> >>>>
> >>>>    BUS_DMA_FIND_PARENT(t, _dmamap_destroy);
> >>>> @@ -667,36 +680,36 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_
> >>>>    u_long dvmaddr, sgstart, sgend;
> >>>>    bus_size_t align, boundary;
> >>>>    struct iommu_state *is;
> >>>> -   struct iommu_map_state *ims = map->_dm_cookie;
> >>>> +   struct iommu_map_state *ims;
> >>>>    pmap_t pmap;
> >>>>
> >>>> -#ifdef DIAGNOSTIC
> >>>> -   if (ims == NULL)
> >>>> -           panic("iommu_dvmamap_load: null map state");
> >>>> -#endif
> >>>> -#ifdef DEBUG
> >>>> -   if (ims->ims_sb == NULL)
> >>>> -           panic("iommu_dvmamap_load: null sb");
> >>>> -   if (ims->ims_sb->sb_iommu == NULL)
> >>>> -           panic("iommu_dvmamap_load: null iommu");
> >>>> -#endif /* DEBUG */
> >>>> -   is = ims->ims_sb->sb_iommu;
> >>>> -
> >>>> -   if (map->dm_nsegs) {
> >>>> -           /*
> >>>> -            * Is it still in use? _bus_dmamap_load should have taken
> >> care
> >>>> -            * of this.
> >>>> -            */
> >>>> -#ifdef DIAGNOSTIC
> >>>> -           panic("iommu_dvmamap_load: map still in use");
> >>>> -#endif
> >>>> -           bus_dmamap_unload(t0, map);
> >>>> -   }
> >>>> -
> >>>>    /*
> >>>>     * Make sure that on error condition we return "no valid mappings".
> >>>>     */
> >>>> -   map->dm_nsegs = 0;
> >>>> +   KASSERTMSG(map->dm_nsegs == 0, "map still in use");
> >>>> +
> >>>> +   if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> >>>> +           unsigned long bypass;
> >>>> +           int i;
> >>>> +
> >>>> +           is = map->_dm_cookie;
> >>>> +           bypass = is->is_hw->ihw_bypass;
> >>>> +
> >>>> +           /* Bypass translation by the IOMMU. */
> >>>> +
> >>>> +           BUS_DMA_FIND_PARENT(t, _dmamap_load);
> >>>> +           err = (*t->_dmamap_load)(t, t0, map, buf, buflen, p,
> >> flags);
> >>>> +           if (err != 0)
> >>>> +                   return (err);
> >>>> +
> >>>> +           for (i = 0; i < map->dm_nsegs; i++)
> >>>> +                   map->dm_segs[i].ds_addr |= bypass;
> >>>> +
> >>>> +           return (0);
> >>>> +   }
> >>>> +
> >>>> +   ims = map->_dm_cookie;
> >>>> +   is = ims->ims_sb->sb_iommu;
> >>>>
> >>>>    if (buflen < 1 || buflen > map->_dm_size) {
> >>>>            DPRINTF(IDB_BUSDMA,
> >>>> @@ -876,28 +889,31 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t,
> >>>>    bus_size_t boundary, align;
> >>>>    u_long dvmaddr, sgstart, sgend;
> >>>>    struct iommu_state *is;
> >>>> -   struct iommu_map_state *ims = map->_dm_cookie;
> >>>> +   struct iommu_map_state *ims;
> >>>>
> >>>> -#ifdef DIAGNOSTIC
> >>>> -   if (ims == NULL)
> >>>> -           panic("iommu_dvmamap_load_raw: null map state");
> >>>> -#endif
> >>>> -#ifdef DEBUG
> >>>> -   if (ims->ims_sb == NULL)
> >>>> -           panic("iommu_dvmamap_load_raw: null sb");
> >>>> -   if (ims->ims_sb->sb_iommu == NULL)
> >>>> -           panic("iommu_dvmamap_load_raw: null iommu");
> >>>> -#endif /* DEBUG */
> >>>> -   is = ims->ims_sb->sb_iommu;
> >>>> +   KASSERTMSG(map->dm_nsegs == 0, "map stil in use");
> >>>>
> >>>> -   if (map->dm_nsegs) {
> >>>> -           /* Already in use?? */
> >>>> -#ifdef DIAGNOSTIC
> >>>> -           panic("iommu_dvmamap_load_raw: map still in use");
> >>>> -#endif
> >>>> -           bus_dmamap_unload(t0, map);
> >>>> +   if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> >>>> +           unsigned long bypass;
> >>>> +
> >>>> +           is = map->_dm_cookie;
> >>>> +           bypass = is->is_hw->ihw_bypass;
> >>>> +
> >>>> +           /* Bypass translation by the IOMMU. */
> >>>> +           for (i = 0; i < nsegs; i++) {
> >>>> +                   map->dm_segs[i].ds_addr = bypass |
> segs[i].ds_addr;
> >>>> +                   map->dm_segs[i].ds_len = segs[i].ds_len;
> >>>> +           }
> >>>> +
> >>>> +           map->dm_nsegs = nsegs;
> >>>> +           map->dm_mapsize = size;
> >>>> +
> >>>> +           return (0);
> >>>>    }
> >>>>
> >>>> +   ims = map->_dm_cookie;
> >>>> +   is = ims->ims_sb->sb_iommu;
> >>>> +
> >>>>    /*
> >>>>     * A boundary presented to bus_dmamem_alloc() takes precedence
> >>>>     * over boundary in the map.
> >>>> @@ -1088,11 +1104,6 @@ iommu_dvmamap_append_range(bus_dma_tag_t
> >>>>    bus_dma_segment_t *seg = NULL;
> >>>>    int i = map->dm_nsegs;
> >>>>
> >>>> -#ifdef DEBUG
> >>>> -   if (ims == NULL)
> >>>> -           panic("iommu_dvmamap_append_range: null map state");
> >>>> -#endif
> >>>> -
> >>>>    sgstart = iommu_iomap_translate(ims, pa);
> >>>>    sgend = sgstart + length - 1;
> >>>>
> >>>> @@ -1298,20 +1309,17 @@ void
> >>>> iommu_dvmamap_unload(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t
> >> map)
> >>>> {
> >>>>    struct iommu_state *is;
> >>>> -   struct iommu_map_state *ims = map->_dm_cookie;
> >>>> +   struct iommu_map_state *ims;
> >>>>    bus_addr_t dvmaddr = map->_dm_dvmastart;
> >>>>    bus_size_t sgsize = map->_dm_dvmasize;
> >>>>    int error;
> >>>>
> >>>> -#ifdef DEBUG
> >>>> -   if (ims == NULL)
> >>>> -           panic("iommu_dvmamap_unload: null map state");
> >>>> -   if (ims->ims_sb == NULL)
> >>>> -           panic("iommu_dvmamap_unload: null sb");
> >>>> -   if (ims->ims_sb->sb_iommu == NULL)
> >>>> -           panic("iommu_dvmamap_unload: null iommu");
> >>>> -#endif /* DEBUG */
> >>>> +   if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) {
> >>>> +           bus_dmamap_unload(t->_parent, map);
> >>>> +           return;
> >>>> +   }
> >>>>
> >>>> +   ims = map->_dm_cookie;
> >>>>    is = ims->ims_sb->sb_iommu;
> >>>>
> >>>>    /* Flush the iommu */
> >>>> @@ -1488,7 +1496,7 @@ iommu_dvmamap_print_map(bus_dma_tag_t t,
> >>>>            break;
> >>>>    }
> >>>>
> >>>> -   if (map->_dm_cookie) {
> >>>> +   if (!ISSET(map->_dm_flags, BUS_DMA_64BIT) && map->_dm_cookie !=
> >> NULL) {
> >>>>            struct iommu_map_state *ims = map->_dm_cookie;
> >>>>            struct iommu_page_map *ipm = &ims->ims_map;
> >>>>
> >>>> @@ -1546,19 +1554,19 @@ void
> >>>> iommu_dvmamap_sync(bus_dma_tag_t t, bus_dma_tag_t t0, bus_dmamap_t
> >> map,
> >>>>     bus_addr_t offset, bus_size_t len, int ops)
> >>>> {
> >>>> -   struct iommu_map_state *ims = map->_dm_cookie;
> >>>> +   struct iommu_map_state *ims;
> >>>>
> >>>> -#ifdef DIAGNOSTIC
> >>>> -   if (ims == NULL)
> >>>> -           panic("iommu_dvmamap_sync: null map state");
> >>>> -   if (ims->ims_sb == NULL)
> >>>> -           panic("iommu_dvmamap_sync: null sb");
> >>>> -   if (ims->ims_sb->sb_iommu == NULL)
> >>>> -           panic("iommu_dvmamap_sync: null iommu");
> >>>> -#endif
> >>>>    if (len == 0)
> >>>>            return;
> >>>>
> >>>> +   if (map->_dm_flags & BUS_DMA_64BIT) {
> >>>> +           if (ops & (BUS_DMASYNC_PREWRITE | BUS_DMASYNC_POSTREAD))
> >>>> +                   membar(MemIssue);
> >>>> +           return;
> >>>> +   }
> >>>> +
> >>>> +   ims = map->_dm_cookie;
> >>>> +
> >>>>    if (ops & BUS_DMASYNC_PREWRITE)
> >>>>            membar(MemIssue);
> >>>>
> >>>> @@ -1622,9 +1630,13 @@ iommu_dvmamem_alloc(bus_dma_tag_t t, bus
> >>>>        "bound %llx segp %p flags %d\n", (unsigned long long)size,
> >>>>        (unsigned long long)alignment, (unsigned long long)boundary,
> >>>>        segs, flags));
> >>>> +
> >>>> +   if ((flags & BUS_DMA_64BIT) == 0)
> >>>> +           flags |= BUS_DMA_DVMA;
> >>>> +
> >>>>    BUS_DMA_FIND_PARENT(t, _dmamem_alloc);
> >>>>    return ((*t->_dmamem_alloc)(t, t0, size, alignment, boundary,
> >>>> -       segs, nsegs, rsegs, flags | BUS_DMA_DVMA));
> >>>> +       segs, nsegs, rsegs, flags));
> >>>> }
> >>>>
> >>>> void
> >>>> @@ -1763,7 +1775,7 @@ iommu_iomap_load_map(struct iommu_state
> >>>>
> >>>>            /* Flush cache if necessary. */
> >>>>            slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
> >>>> -           if (is->is_flags & IOMMU_FLUSH_CACHE &&
> >>>> +           if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
> >>>>                (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
> >>>>                    IOMMUREG_WRITE(is, iommu_cache_flush,
> >>>>                        is->is_ptsb + slot * 8);
> >>>> @@ -1788,7 +1800,7 @@ iommu_iomap_unload_map(struct iommu_stat
> >>>>
> >>>>            /* Flush cache if necessary. */
> >>>>            slot = IOTSBSLOT(e->ipe_va, is->is_tsbsize);
> >>>> -           if (is->is_flags & IOMMU_FLUSH_CACHE &&
> >>>> +           if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE) &&
> >>>>                (i == (ipm->ipm_pagecnt - 1) || (slot % 8) == 7))
> >>>>                    IOMMUREG_WRITE(is, iommu_cache_flush,
> >>>>                        is->is_ptsb + slot * 8);
> >>>> Index: dev/iommureg.h
> >>>> ===================================================================
> >>>> RCS file: /cvs/src/sys/arch/sparc64/dev/iommureg.h,v
> >>>> retrieving revision 1.17
> >>>> diff -u -p -r1.17 iommureg.h
> >>>> --- dev/iommureg.h  17 Aug 2012 20:46:50 -0000      1.17
> >>>> +++ dev/iommureg.h  10 May 2017 12:00:09 -0000
> >>>> @@ -90,10 +90,11 @@ struct iommu_strbuf {
> >>>> #define IOMMUCR_DE         0x000000000000000002LL  /* Diag enable */
> >>>> #define IOMMUCR_EN         0x000000000000000001LL  /* Enable IOMMU */
> >>>>
> >>>> -#define IOMMUCR_FIRE_SE            0x000000000000000400LL  /* Snoop
> >> enable */
> >>>> -#define IOMMUCR_FIRE_CM_EN 0x000000000000000300LL  /* Cache mode
> >> enable */
> >>>> -#define IOMMUCR_FIRE_BE            0x000000000000000002LL  /* Bypass
> >> enable */
> >>>> -#define IOMMUCR_FIRE_TE            0x000000000000000001LL  /*
> >> Translation enabled */
> >>>> +#define IOMMUCR_FIRE_PD            0x000000000000001000UL  /* Process
> >> disable */
> >>>> +#define IOMMUCR_FIRE_SE            0x000000000000000400UL  /* Snoop
> >> enable */
> >>>> +#define IOMMUCR_FIRE_CM_EN 0x000000000000000300UL  /* Cache mode
> >> enable */
> >>>> +#define IOMMUCR_FIRE_BE            0x000000000000000002UL  /* Bypass
> >> enable */
> >>>> +#define IOMMUCR_FIRE_TE            0x000000000000000001UL  /*
> >> Translation enabled */
> >>>>
> >>>> /*
> >>>>  * IOMMU stuff
> >>>> Index: dev/iommuvar.h
> >>>> ===================================================================
> >>>> RCS file: /cvs/src/sys/arch/sparc64/dev/iommuvar.h,v
> >>>> retrieving revision 1.17
> >>>> diff -u -p -r1.17 iommuvar.h
> >>>> --- dev/iommuvar.h  4 May 2016 18:26:12 -0000       1.17
> >>>> +++ dev/iommuvar.h  10 May 2017 12:00:09 -0000
> >>>> @@ -100,6 +100,21 @@ struct iommu_map_state {
> >>>> };
> >>>> #define IOMMU_MAP_STREAM   1
> >>>>
> >>>> +struct iommu_hw {
> >>>> +   void                    (*ihw_enable)(struct iommu_state *);
> >>>> +
> >>>> +   unsigned long           ihw_dvma_pa;
> >>>> +
> >>>> +   unsigned long           ihw_bypass;
> >>>> +   unsigned long           ihw_bypass_nc;          /* non-cached */
> >>>> +   unsigned long           ihw_bypass_ro;          /* relaxed
> >> ordering */
> >>>> +
> >>>> +   unsigned int            ihw_flags;
> >>>> +#define IOMMU_HW_FLUSH_CACHE               (1 << 0)
> >>>> +};
> >>>> +
> >>>> +extern const struct iommu_hw iommu_hw_default;
> >>>> +
> >>>> /*
> >>>>  * per-IOMMU state
> >>>>  */
> >>>> @@ -112,8 +127,7 @@ struct iommu_state {
> >>>>    int64_t                 is_cr;          /* Control register value
> >> */
> >>>>    struct mutex            is_mtx;
> >>>>    struct extent           *is_dvmamap;    /* DVMA map for this
> >> instance */
> >>>> -   int                     is_flags;
> >>>> -#define IOMMU_FLUSH_CACHE  0x00000001
> >>>> +   const struct iommu_hw   *is_hw;
> >>>>
> >>>>    struct strbuf_ctl       *is_sb[2];      /* Streaming buffers if
> >> any */
> >>>>
> >>>> @@ -126,7 +140,8 @@ struct iommu_state {
> >>>> };
> >>>>
> >>>> /* interfaces for PCI/SBus code */
> >>>> -void       iommu_init(char *, struct iommu_state *, int, u_int32_t);
> >>>> +void       iommu_init(char *, const struct iommu_hw *, struct
> >> iommu_state *,
> >>>> +    int, u_int32_t);
> >>>> void       iommu_reset(struct iommu_state *);
> >>>> paddr_t iommu_extract(struct iommu_state *, bus_addr_t);
> >>>> int64_t iommu_lookup_tte(struct iommu_state *, bus_addr_t);
> >>>> @@ -146,6 +161,7 @@ int     iommu_dvmamem_alloc(bus_dma_tag_t, b
> >>>>        bus_size_t, bus_size_t, bus_dma_segment_t *, int, int *, int);
> >>>> void       iommu_dvmamem_free(bus_dma_tag_t, bus_dma_tag_t,
> >> bus_dma_segment_t *,
> >>>>        int);
> >>>> +
> >>>>
> >>>> #define IOMMUREG_READ(is, reg)                             \
> >>>>    bus_space_read_8((is)->is_bustag,               \
> >>>> Index: dev/pci_machdep.c
> >>>> ===================================================================
> >>>> RCS file: /cvs/src/sys/arch/sparc64/dev/pci_machdep.c,v
> >>>> retrieving revision 1.44
> >>>> diff -u -p -r1.44 pci_machdep.c
> >>>> --- dev/pci_machdep.c       10 May 2014 12:15:19 -0000      1.44
> >>>> +++ dev/pci_machdep.c       10 May 2017 12:00:09 -0000
> >>>> @@ -58,6 +58,7 @@ int sparc_pci_debug = 0x0;
> >>>> #include <machine/openfirm.h>
> >>>> #include <dev/pci/pcivar.h>
> >>>> #include <dev/pci/pcireg.h>
> >>>> +#include <dev/pci/pcidevs.h>
> >>>>
> >>>> #include <dev/ofw/ofw_pci.h>
> >>>>
> >>>> @@ -85,6 +86,46 @@ pci_attach_hook(parent, self, pba)
> >>>>    struct pcibus_attach_args *pba;
> >>>> {
> >>>>    /* Don't do anything */
> >>>> +}
> >>>> +
> >>>> +int
> >>>> +pci_bcm_dmamap_create(bus_dma_tag_t dt, bus_dma_tag_t t0, bus_size_t
> >> size,
> >>>> +    int nsegments, bus_size_t maxsegsz, bus_size_t boundary, int
> >> flags,
> >>>> +    bus_dmamap_t *dmamp)
> >>>> +{
> >>>> +   bus_dma_tag_t pdt = dt->_parent;
> >>>> +
> >>>> +   CLR(flags, BUS_DMA_64BIT);
> >>>> +
> >>>> +   return ((*pdt->_dmamap_create)(pdt, t0, size, nsegments, maxsegsz,
> >>>> +       boundary, flags, dmamp));
> >>>> +}
> >>>> +
> >>>> +int
> >>>> +pci_probe_device_hook(pci_chipset_tag_t pc, struct pci_attach_args
> >> *pa)
> >>>> +{
> >>>> +   bus_dma_tag_t dt, pdt;
> >>>> +
> >>>> +   if (pa->pa_id ==
> >>>> +       PCI_ID_CODE(PCI_VENDOR_RCC, PCI_PRODUCT_RCC_PCIE_PCIX)) {
> >>>> +           /*
> >>>> +            * These PCI bridges only support 40bit DVA, so intercept
> >>>> +            * bus_dmamap_create so we can clear BUS_DMA_64BIT.
> >>>> +            */
> >>>> +
> >>>> +           dt = malloc(sizeof(*dt), M_DEVBUF, M_NOWAIT | M_ZERO);
> >>>> +           if (dt == NULL)
> >>>> +                   panic("%s: could not alloc dma tag", __func__);
> >>>> +
> >>>> +           pdt = pa->pa_dmat;
> >>>> +
> >>>> +           dt->_parent = pdt;
> >>>> +           dt->_dmamap_create = pci_bcm_dmamap_create;
> >>>> +
> >>>> +           pa->pa_dmat = dt;
> >>>> +   }
> >>>> +
> >>>> +   return (0);
> >>>> }
> >>>>
> >>>> int
> >>>> Index: dev/psycho.c
> >>>> ===================================================================
> >>>> RCS file: /cvs/src/sys/arch/sparc64/dev/psycho.c,v
> >>>> retrieving revision 1.74
> >>>> diff -u -p -r1.74 psycho.c
> >>>> --- dev/psycho.c    23 Aug 2016 03:28:01 -0000      1.74
> >>>> +++ dev/psycho.c    10 May 2017 12:00:09 -0000
> >>>> @@ -902,7 +902,7 @@ psycho_iommu_init(struct psycho_softc *s
> >>>>            panic("couldn't malloc iommu name");
> >>>>    snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
> >>>>
> >>>> -   iommu_init(name, is, tsbsize, iobase);
> >>>> +   iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
> >>>> }
> >>>>
> >>>> /*
> >>>> Index: dev/pyro.c
> >>>> ===================================================================
> >>>> RCS file: /cvs/src/sys/arch/sparc64/dev/pyro.c,v
> >>>> retrieving revision 1.30
> >>>> diff -u -p -r1.30 pyro.c
> >>>> --- dev/pyro.c      20 Dec 2016 13:40:50 -0000      1.30
> >>>> +++ dev/pyro.c      10 May 2017 12:00:09 -0000
> >>>> @@ -131,6 +131,30 @@ int pyro_msi_eq_intr(void *);
> >>>> int pyro_dmamap_create(bus_dma_tag_t, bus_dma_tag_t, bus_size_t, int,
> >>>>     bus_size_t, bus_size_t, int, bus_dmamap_t *);
> >>>>
> >>>> +void pyro_iommu_enable(struct iommu_state *);
> >>>> +
> >>>> +const struct iommu_hw iommu_hw_fire = {
> >>>> +   .ihw_enable     = pyro_iommu_enable,
> >>>> +
> >>>> +   .ihw_dvma_pa    = 0x000007ffffffffffUL,
> >>>> +
> >>>> +   .ihw_bypass     = 0xfffc000000000000UL,
> >>>> +   .ihw_bypass_nc  = 0x0000080000000000UL,
> >>>> +   .ihw_bypass_ro  = 0,
> >>>> +};
> >>>> +
> >>>> +const struct iommu_hw iommu_hw_oberon = {
> >>>> +   .ihw_enable     = pyro_iommu_enable,
> >>>> +
> >>>> +   .ihw_dvma_pa    = 0x00007fffffffffffUL,
> >>>> +
> >>>> +   .ihw_bypass     = 0x7ffc000000000000UL,
> >>>> +   .ihw_bypass_nc  = 0x0000800000000000UL,
> >>>> +   .ihw_bypass_ro  = 0x8000000000000000UL,
> >>>> +
> >>>> +   .ihw_flags      = IOMMU_HW_FLUSH_CACHE,
> >>>> +};
> >>>> +
> >>>> #ifdef DDB
> >>>> void pyro_xir(void *, int);
> >>>> #endif
> >>>> @@ -266,6 +290,7 @@ pyro_init_iommu(struct pyro_softc *sc, s
> >>>>    int tsbsize = 7;
> >>>>    u_int32_t iobase = -1;
> >>>>    char *name;
> >>>> +   const struct iommu_hw *ihw = &iommu_hw_fire;
> >>>>
> >>>>    is->is_bustag = sc->sc_bust;
> >>>>
> >>>> @@ -282,11 +307,23 @@ pyro_init_iommu(struct pyro_softc *sc, s
> >>>>            panic("couldn't malloc iommu name");
> >>>>    snprintf(name, 32, "%s dvma", sc->sc_dv.dv_xname);
> >>>>
> >>>> -   /* On Oberon, we need to flush the cache. */
> >>>>    if (sc->sc_oberon)
> >>>> -           is->is_flags |= IOMMU_FLUSH_CACHE;
> >>>> +           ihw = &iommu_hw_oberon;
> >>>> +
> >>>> +   iommu_init(name, ihw, is, tsbsize, iobase);
> >>>> +}
> >>>> +
> >>>> +void
> >>>> +pyro_iommu_enable(struct iommu_state *is)
> >>>> +{
> >>>> +   unsigned long cr;
> >>>> +
> >>>> +   cr = IOMMUREG_READ(is, iommu_cr);
> >>>> +   cr |= IOMMUCR_FIRE_BE | IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
> >>>> +       IOMMUCR_FIRE_TE;
> >>>>
> >>>> -   iommu_init(name, is, tsbsize, iobase);
> >>>> +   IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb | is->is_tsbsize);
> >>>> +   IOMMUREG_WRITE(is, iommu_cr, cr);
> >>>> }
> >>>>
> >>>> void
> >>>> Index: dev/sbus.c
> >>>> ===================================================================
> >>>> RCS file: /cvs/src/sys/arch/sparc64/dev/sbus.c,v
> >>>> retrieving revision 1.44
> >>>> diff -u -p -r1.44 sbus.c
> >>>> --- dev/sbus.c      19 Sep 2015 21:07:04 -0000      1.44
> >>>> +++ dev/sbus.c      10 May 2017 12:00:09 -0000
> >>>> @@ -349,7 +349,7 @@ sbus_mb_attach(struct device *parent, st
> >>>>    snprintf(name, 32, "%s dvma", sc->sc_dev.dv_xname);
> >>>>
> >>>>    printf("%s: ", sc->sc_dev.dv_xname);
> >>>> -   iommu_init(name, &sc->sc_is, 0, -1);
> >>>> +   iommu_init(name, &iommu_hw_default, &sc->sc_is, 0, -1);
> >>>>
> >>>>    /* Initialize Starfire PC interrupt translation. */
> >>>>    if (OF_getprop(findroot(), "name", buf, sizeof(buf)) > 0 &&
> >>>> Index: dev/schizo.c
> >>>> ===================================================================
> >>>> RCS file: /cvs/src/sys/arch/sparc64/dev/schizo.c,v
> >>>> retrieving revision 1.67
> >>>> diff -u -p -r1.67 schizo.c
> >>>> --- dev/schizo.c    23 Aug 2016 03:28:01 -0000      1.67
> >>>> +++ dev/schizo.c    10 May 2017 12:00:09 -0000
> >>>> @@ -451,7 +451,7 @@ schizo_init_iommu(struct schizo_softc *s
> >>>>                "using iobase=0x%x, tsbsize=%d\n", iobase, tsbsize));
> >>>>    }
> >>>>
> >>>> -   iommu_init(name, is, tsbsize, iobase);
> >>>> +   iommu_init(name, &iommu_hw_default, is, tsbsize, iobase);
> >>>> }
> >>>>
> >>>> int
> >>>> Index: include/pci_machdep.h
> >>>> ===================================================================
> >>>> RCS file: /cvs/src/sys/arch/sparc64/include/pci_machdep.h,v
> >>>> retrieving revision 1.33
> >>>> diff -u -p -r1.33 pci_machdep.h
> >>>> --- include/pci_machdep.h   4 May 2016 14:30:01 -0000       1.33
> >>>> +++ include/pci_machdep.h   10 May 2017 12:00:09 -0000
> >>>> @@ -74,10 +74,13 @@ struct sparc_pci_chipset {
> >>>>    pcireg_t (*conf_read)(pci_chipset_tag_t, pcitag_t, int);
> >>>>    void (*conf_write)(pci_chipset_tag_t, pcitag_t, int, pcireg_t);
> >>>>    int (*intr_map)(struct pci_attach_args *, pci_intr_handle_t *);
> >>>> +   int (*probe_device_hook)(void *, struct pci_attach_args *);
> >>>> };
> >>>>
> >>>> void               pci_attach_hook(struct device *, struct device *,
> >>>>                                 struct pcibus_attach_args *);
> >>>> +int                pci_probe_device_hook(pci_chipset_tag_t,
> >>>> +               struct pci_attach_args *);
> >>>> int                pci_bus_maxdevs(pci_chipset_tag_t, int);
> >>>> pcitag_t   pci_make_tag(pci_chipset_tag_t, int, int, int);
> >>>> void               pci_decompose_tag(pci_chipset_tag_t, pcitag_t, int
> >> *, int *,
> >>>> @@ -102,8 +105,6 @@ int             sparc64_pci_enumerate_bus(struct
> pc
> >>>>                struct pci_attach_args *);
> >>>>
> >>>> #define PCI_MACHDEP_ENUMERATE_BUS sparc64_pci_enumerate_bus
> >>>> -
> >>>> -#define    pci_probe_device_hook(c, a)     (0)
> >>>>
> >>>> #define    pci_min_powerstate(c, t)        (PCI_PMCSR_STATE_D3)
> >>>> #define    pci_set_powerstate_md(c, t, s, p)
> >>>
> >>>
> >>
> >>
>
>
Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Theo de Raadt-2
Andrew Grillet <[hidden email]> wrote:

> These days we are not so short of memory - would it not be possible to
> allocate an mbuf (or two for double-buffered) for each file
> when opened, and free when closed?

What does this have to do with files??  

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Andrew Grillet
I was assuming that the main objection to allocating mbufs for the duration
of a file being open, rather than allocating per transfer, is that this could
result in a much higher number of mbufs being in use concurrently. I cannot
see any other downside (which may be due to my not understanding a lot of
stuff - I last wrote this level of stuff for Unix in the 1980s).

On Sat, 20 Oct 2018 at 14:41, Theo de Raadt <[hidden email]> wrote:

> Andrew Grillet <[hidden email]> wrote:
>
> > These days we are not so short of memory - would it not be possible to
> > allocate an mbuf (or two for double-buffered) for each file
> > when opened, and free when closed?
>
> What does this have to do with files??
>
Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Theo de Raadt-2
In this case, what do mbufs have to do with files?

I am very confused.

> I was assuming that the main objection to allocating mbufs for duration of
> file open,
> rather than allocating per transfer, this could result in a much higher
> number of mbufs
> being in use concurrently. I cannot see any other downside (which may be
> due to my
> not understanding a lot of stuff - I last wrote this level of stuff for
> Unix in the 1980's).
>
> On Sat, 20 Oct 2018 at 14:41, Theo de Raadt <[hidden email]> wrote:
>
> > Andrew Grillet <[hidden email]> wrote:
> >
> > > These days we are not so short of memory - would it not be possible to
> > > allocate an mbuf (or two for double-buffered) for each file
> > > when opened, and free when closed?
> >
> > What does this have to do with files??
> >

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Andrew Grillet
Ok, what I am proposing is that the IOMMU is set up when a file is opened
to provide the address space required for that file's IO.
This remains set up until the file is closed, avoiding frequent set-up and
tear-down for each IO transfer.

I assume that there is sufficient IOMMU address space to handle any
plausible number of open files, and that it is possible to keep
the knowledge of address spaces private to the Primary Ldom, so that guests
would only be aware of the mbufs visible to them, and that
this is acceptable. (If you can't trust the Primary, I rather suspect you
are stuffed anyway.) Clearly, depending on the IOMMU architecture,
which I do not claim to understand, this could exhaust IO address space
before it exhausts physical memory; I don't know.
But I cannot see any other reason why this would not avoid frequent set-ups
and tear-downs.

I get the impression that disk access is not great on my Txxxx machines. I
expect a 1GHz T1000 to totally piss on a 4GHz Intel
machine at web serving, and it doesn't. (Solaris annoys me too much to even
try it, but I assume it's better than OpenBSD on
sparc64 at this time, or Larry Ellison would have to sell his yacht).





On Sat, 20 Oct 2018 at 20:04, Theo de Raadt <[hidden email]> wrote:

> In this case, what do mbufs have to do with files?
>
> I am very confused.
>
> > I was assuming that the main objection to allocating mbufs for duration
> of
> > file open,
> > rather than allocating per transfer, this could result in a much higher
> > number of mbufs
> > being in use concurrently. I cannot see any other downside (which may be
> > due to my
> > not understanding a lot of stuff - I last wrote this level of stuff for
> > Unix in the 1980's).
> >
> > On Sat, 20 Oct 2018 at 14:41, Theo de Raadt <[hidden email]> wrote:
> >
> > > Andrew Grillet <[hidden email]> wrote:
> > >
> > > > These days we are not so short of memory - would it not be possible
> to
> > > > allocate an mbuf (or two for double-buffered) for each file
> > > > when opened, and free when closed?
> > >
> > > What does this have to do with files??
> > >
>
Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Theo de Raadt-2
Andrew Grillet <[hidden email]> wrote:

> Ok, what I am proposing is that the IOMMU is set up when a file is opened
> to provide the address space required for that file's IO.

Wow, you keep saying file as if it means something.

packets off the network are not associated with any specific "file"
activity

it isn't how the kernel works.

You are ... way off target.

Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Andrew Grillet
So, substitute opening and closing the connection to the network?

Is the IOMMU not used for disk (and all SCSI) access also?



On Sat, 20 Oct 2018 at 20:32, Theo de Raadt <[hidden email]> wrote:

> Andrew Grillet <[hidden email]> wrote:
>
> > Ok, what I am proposing is that the IOMMU is set up when a file is opened
> > to provide the address space required for that file's IO.
>
> Wow, you keep saying file as if it means something.
>
> packets off the network are not associated with any specific "file"
> activity
>
> it isn't how the kernel works.
>
> You are ... way off target.
>
Reply | Threaded
Open this post in threaded view
|

Re: bypass support for iommu on sparc64

Bryan Steele-2
This is OpenBSD tech@

On Sat, Oct 20, 2018 at 08:36:33PM +0100, Andrew Grillet wrote:

> So, substitute opening and closing the connection to the network?
>
> Is the IOMMU not used for disk (and all SCSI) access also?
>
>
>
> On Sat, 20 Oct 2018 at 20:32, Theo de Raadt <[hidden email]> wrote:
>
> > Andrew Grillet <[hidden email]> wrote:
> >
> > > Ok, what I am proposing is that the IOMMU is set up when a file is opened
> > > to provide the address space required for that file's IO.
> >
> > Wow, you keep saying file as if it means something.
> >
> > packets off the network are not associated with any specific "file"
> > activity
> >
> > it isn't how the kernel works.
> >
> > You are ... way off target.
> >
>