arm64 pmap tlb flushing diff

Mark Kettenis
While reading the arm64 code, I found that the ranges covered by the TLB
flushes in the code that sets the l1/l2/l3 tables made no sense.  I came to
the conclusion that these flushes aren't necessary: when we enter the final
page table entry, we flush the TLB again, which flushes any cached
information for the l1/l2/l3 tables anyway.  This means the code that
flushes ranges can go and arm64_tlbi_asid() can be folded into the
ttlb_flush() function.

One additional fix there: my previous changes made the asid == -1 branch
unreachable.  I think that means that page table entries marked as global
are actually flushed even if the flush targets a specific ASID, but the
architecture manual isn't very explicit about this.  So I reactivated the
branch by changing the test to pm == pmap_kernel().
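
For reference, this is roughly how ttlb_flush() reads with arm64_tlbi_asid()
folded in (a sketch reconstructed from the diff below, not new code):

static inline void
ttlb_flush(pmap_t pm, vaddr_t va)
{
	vaddr_t resva;

	/* TLBI operand: bits [43:0] carry VA >> PAGE_SHIFT. */
	resva = ((va >> PAGE_SHIFT) & ((1ULL << 44) - 1));
	if (pm == pmap_kernel()) {
		/* Kernel mappings may be global; flush them for all ASIDs. */
		cpu_tlb_flush_all_asid(resva);
	} else {
		/* Bits [63:48] carry the target ASID. */
		resva |= (uint64_t)pm->pm_asid << 48;
		cpu_tlb_flush_asid(resva);
	}
}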

In the end this doesn't change performance in a measurable way, but less code is always better, isn't it?

ok?


Index: arch/arm64/arm64/pmap.c
===================================================================
RCS file: /cvs/src/sys/arch/arm64/arm64/pmap.c,v
retrieving revision 1.32
diff -u -p -r1.32 pmap.c
--- arch/arm64/arm64/pmap.c 13 Apr 2017 20:48:29 -0000 1.32
+++ arch/arm64/arm64/pmap.c 14 Apr 2017 09:27:22 -0000
@@ -35,7 +35,6 @@
 #include <ddb/db_output.h>
 
 void pmap_setttb(struct proc *p);
-void arm64_tlbi_asid(vaddr_t va, int asid);
 void pmap_free_asid(pmap_t pm);
 
 /*
@@ -68,34 +67,13 @@ pmap_pa_is_mem(uint64_t pa)
 static inline void
 ttlb_flush(pmap_t pm, vaddr_t va)
 {
- arm64_tlbi_asid(va, pm->pm_asid);
-}
-
-static inline void
-ttlb_flush_range(pmap_t pm, vaddr_t va, vsize_t size)
-{
- vaddr_t eva = va + size;
-
- /* if size is over 512 pages, just flush the entire cache !?!?! */
- if (size >= (512 * PAGE_SIZE)) {
- cpu_tlb_flush();
- return;
- }
-
- for ( ; va < eva; va += PAGE_SIZE)
- arm64_tlbi_asid(va, pm->pm_asid);
-}
-
-void
-arm64_tlbi_asid(vaddr_t va, int asid)
-{
  vaddr_t resva;
 
  resva = ((va >> PAGE_SHIFT) & ((1ULL << 44) - 1));
- if (asid == -1) {
+ if (pm == pmap_kernel()) {
  cpu_tlb_flush_all_asid(resva);
  } else {
- resva |= (unsigned long long)asid << 48;
+ resva |= (uint64_t)pm->pm_asid << 48;
  cpu_tlb_flush_asid(resva);
  }
 }
@@ -1267,8 +1245,6 @@ pmap_set_l1(struct pmap *pm, uint64_t va
  idx0 = VP_IDX0(va);
  pm->pm_vp.l0->vp[idx0] = l1_va;
  pm->pm_vp.l0->l0[idx0] = pg_entry;
-
- ttlb_flush_range(pm, va & ~PAGE_MASK, 1<<VP_IDX1_POS);
 }
 
 void
@@ -1299,8 +1275,6 @@ pmap_set_l2(struct pmap *pm, uint64_t va
  vp1 = pm->pm_vp.l1;
  vp1->vp[idx1] = l2_va;
  vp1->l1[idx1] = pg_entry;
-
- ttlb_flush_range(pm, va & ~PAGE_MASK, 1<<VP_IDX2_POS);
 }
 
 void
@@ -1334,8 +1308,6 @@ pmap_set_l3(struct pmap *pm, uint64_t va
  vp2 = vp1->vp[idx1];
  vp2->vp[idx2] = l3_va;
  vp2->l2[idx2] = pg_entry;
-
- ttlb_flush_range(pm, va & ~PAGE_MASK, 1<<VP_IDX3_POS);
 }
 
 /*
@@ -1578,7 +1550,7 @@ pmap_pte_remove(struct pte_desc *pted, i
  if (remove_pted)
  vp3->vp[VP_IDX3(pted->pted_va)] = NULL;
 
- arm64_tlbi_asid(pted->pted_va, pm->pm_asid);
+ ttlb_flush(pm, pted->pted_va);
 }
 
 /*

Re: arm64 pmap tlb flushing diff

Patrick Wildt
On Fri, Apr 14, 2017 at 11:51:52AM +0200, Mark Kettenis wrote:

> [...]
>
> @@ -1578,7 +1550,7 @@ pmap_pte_remove(struct pte_desc *pted, i
>   if (remove_pted)
>   vp3->vp[VP_IDX3(pted->pted_va)] = NULL;
>  
> - arm64_tlbi_asid(pted->pted_va, pm->pm_asid);
> + ttlb_flush(pm, pted->pted_va);

Other callers pass pted->pted_va & ~PAGE_MASK, but since ttlb_flush()
does va >> PAGE_SHIFT, the masking probably isn't needed anyway.  It
might make sense to drop it from the other calls as well.  AFAIK it was
simply there to be explicit that we pass a VA and skip the attributes
stored in its lower bits.
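
To illustrate the point, clearing the low bits before the shift can't
change the result.  A standalone sketch, assuming the usual 4K-page
definitions (PAGE_SHIFT == 12, PAGE_MASK == PAGE_SIZE - 1) and an LP64
target such as arm64:

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT	12			/* assumed: 4K pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(PAGE_SIZE - 1)

int
main(void)
{
	/*
	 * Arbitrary VA with attribute bits set in the low bits, like
	 * pted->pted_va.
	 */
	uint64_t va = 0xffff000012345678ULL;

	/*
	 * PAGE_MASK only covers bits below PAGE_SHIFT, so masking them
	 * off first cannot change the shifted value.
	 */
	assert(((va & ~PAGE_MASK) >> PAGE_SHIFT) == (va >> PAGE_SHIFT));
	return 0;
}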

Diff doesn't make it any worse on my Seattle, ok patrick@.

Patrick
