#ifndef _I386_PGTABLE_H
#define _I386_PGTABLE_H

/*
 * The Linux memory management assumes a three-level page table setup. On
 * the i386, we use that, but "fold" the mid level into the top-level page
 * table, so that we physically have the same two-level page table as the
 * i386 mmu expects.
 *
 * This file contains the functions and defines necessary to modify and use
 * the i386 page table tree.
 */
#ifndef __ASSEMBLY__
#include <asm/processor.h>
#include <asm/fixmap.h>
#include <linux/threads.h>
#include <asm/paravirt.h>

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct mm_struct;
struct vm_area_struct;

/*
 * ZERO_PAGE is a global shared page that is always zero: used
 * for zero-mapped memory areas etc..
 */
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
extern unsigned long empty_zero_page[1024];
extern pgd_t swapper_pg_dir[1024];
extern struct kmem_cache *pmd_cache;
extern spinlock_t pgd_lock;
extern struct page *pgd_list;
void check_pgt_cache(void);

void pmd_ctor(struct kmem_cache *, void *);
void pgtable_cache_init(void);
void paging_init(void);

/*
 * The Linux x86 paging architecture is 'compile-time dual-mode': it
 * implements both the traditional 2-level x86 page tables and the
 * newer 3-level PAE-mode page tables.
 */
#ifdef CONFIG_X86_PAE
# include <asm/pgtable-3level-defs.h>
# define PMD_SIZE	(1UL << PMD_SHIFT)
# define PMD_MASK	(~(PMD_SIZE-1))
#else
# include <asm/pgtable-2level-defs.h>
#endif

#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
#define PGDIR_MASK	(~(PGDIR_SIZE-1))

#define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
#define FIRST_USER_ADDRESS	0

#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)

#define TWOLEVEL_PGDIR_SHIFT	22
#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)

/* An arbitrary offset to the start of the vmalloc VM area: the
 * current 8MB value just means that there will be an 8MB "hole" after the
 * physical memory until the kernel virtual memory starts. That means that
 * any out-of-bounds memory accesses will hopefully be caught.
 * The vmalloc() routines leave a hole of 4kB between each vmalloced
 * area for the same reason. ;)
 */
#define VMALLOC_OFFSET	(8*1024*1024)
#define VMALLOC_START	(((unsigned long) high_memory + \
			2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
#ifdef CONFIG_HIGHMEM
# define VMALLOC_END	(PKMAP_BASE-2*PAGE_SIZE)
#else
# define VMALLOC_END	(FIXADDR_START-2*PAGE_SIZE)
#endif
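/*
 * Worked example for the VMALLOC_START rounding above (editor's sketch;
 * the concrete addresses are assumptions, e.g. high_memory == 0xc8000000
 * with 128MB of lowmem and PAGE_OFFSET == 0xc0000000):
 *
 *	(0xc8000000 + 2*0x00800000 - 1) & ~0x007fffff == 0xc8800000
 *
 * i.e. vmalloc space starts at least 8MB (just under 16MB if high_memory
 * is not 8MB-aligned) above the end of the direct mapping, leaving an
 * unmapped guard hole that catches stray out-of-bounds accesses.
 */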
/*
 * _PAGE_PSE set in the page directory entry just means that
 * the page directory entry points directly to a 4MB-aligned block of
 * memory.
 */
#define _PAGE_BIT_PRESENT	0
#define _PAGE_BIT_RW		1
#define _PAGE_BIT_USER		2
#define _PAGE_BIT_PWT		3
#define _PAGE_BIT_PCD		4
#define _PAGE_BIT_ACCESSED	5
#define _PAGE_BIT_DIRTY		6
#define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page, Pentium+, if present.. */
#define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
#define _PAGE_BIT_UNUSED1	9	/* available for programmer */
#define _PAGE_BIT_UNUSED2	10
#define _PAGE_BIT_UNUSED3	11
#define _PAGE_BIT_NX		63

#define _PAGE_PRESENT	0x001
#define _PAGE_RW	0x002
#define _PAGE_USER	0x004
#define _PAGE_PWT	0x008
#define _PAGE_PCD	0x010
#define _PAGE_ACCESSED	0x020
#define _PAGE_DIRTY	0x040
#define _PAGE_PSE	0x080	/* 4 MB (or 2MB) page, Pentium+, if present.. */
#define _PAGE_GLOBAL	0x100	/* Global TLB entry PPro+ */
#define _PAGE_UNUSED1	0x200	/* available for programmer */
#define _PAGE_UNUSED2	0x400
#define _PAGE_UNUSED3	0x800

/* If _PAGE_PRESENT is clear, we use these: */
#define _PAGE_FILE	0x040	/* nonlinear file mapping, saved PTE; unset:swap */
#define _PAGE_PROTNONE	0x080	/* if the user mapped it with PROT_NONE;
				   pte_present gives true */
#ifdef CONFIG_X86_PAE
#define _PAGE_NX	(1ULL<<_PAGE_BIT_NX)
#else
#define _PAGE_NX	0
#endif

#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
#define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
#define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)

#define PAGE_NONE \
	__pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
#define PAGE_SHARED \
	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)

#define PAGE_SHARED_EXEC \
	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
#define PAGE_COPY_NOEXEC \
	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
#define PAGE_COPY_EXEC \
	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
#define PAGE_COPY \
	PAGE_COPY_NOEXEC
#define PAGE_READONLY \
	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
#define PAGE_READONLY_EXEC \
	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)

#define _PAGE_KERNEL \
	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
#define _PAGE_KERNEL_EXEC \
	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)

extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
#define __PAGE_KERNEL_RO		(__PAGE_KERNEL & ~_PAGE_RW)
#define __PAGE_KERNEL_RX		(__PAGE_KERNEL_EXEC & ~_PAGE_RW)
#define __PAGE_KERNEL_NOCACHE		(__PAGE_KERNEL | _PAGE_PCD)
#define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)

#define PAGE_KERNEL		__pgprot(__PAGE_KERNEL)
#define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO)
#define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC)
#define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX)
#define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE)
#define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE)
#define PAGE_KERNEL_LARGE_EXEC	__pgprot(__PAGE_KERNEL_LARGE_EXEC)

/*
 * The i386 can't do page protection for execute and treats execute
 * permission the same as read. Also, write permissions imply read
 * permissions. This is the closest we can get..
 */
#define __P000	PAGE_NONE
#define __P001	PAGE_READONLY
#define __P010	PAGE_COPY
#define __P011	PAGE_COPY
#define __P100	PAGE_READONLY_EXEC
#define __P101	PAGE_READONLY_EXEC
#define __P110	PAGE_COPY_EXEC
#define __P111	PAGE_COPY_EXEC

#define __S000	PAGE_NONE
#define __S001	PAGE_READONLY
#define __S010	PAGE_SHARED
#define __S011	PAGE_SHARED
#define __S100	PAGE_READONLY_EXEC
#define __S101	PAGE_READONLY_EXEC
#define __S110	PAGE_SHARED_EXEC
#define __S111	PAGE_SHARED_EXEC
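/*
 * Editor's note (illustrative sketch, not part of the original header):
 * the generic mm code gathers the entries above into protection_map[16],
 * indexed by the VM_READ/VM_WRITE/VM_EXEC/VM_SHARED bits, roughly:
 *
 *	pgprot_t protection_map[16] = {
 *		__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
 *		__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
 *	};
 *
 * so a private PROT_READ|PROT_WRITE mapping gets __P011 == PAGE_COPY
 * (mapped read-only and made writable by copy-on-write on the first
 * write fault), while the shared equivalent gets __S011 == PAGE_SHARED
 * and is writable directly.
 */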
/*
 * Define this if things work differently on an i386 and an i486:
 * it will (on an i486) warn about kernel memory accesses that are
 * done without an 'access_ok(VERIFY_WRITE,..)'
 */
#undef TEST_ACCESS_OK

/* The boot page tables (all created as a single array) */
extern unsigned long pg0[];

#define pte_present(x)	((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))

/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE is enabled */
#define pmd_none(x)	(!(unsigned long)pmd_val(x))
#define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
#define pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)

#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))

/*
 * The following only work if pte_present() is true.
 * Undefined behaviour if not..
 */
static inline int pte_dirty(pte_t pte)		{ return (pte).pte_low & _PAGE_DIRTY; }
static inline int pte_young(pte_t pte)		{ return (pte).pte_low & _PAGE_ACCESSED; }
static inline int pte_write(pte_t pte)		{ return (pte).pte_low & _PAGE_RW; }
static inline int pte_huge(pte_t pte)		{ return (pte).pte_low & _PAGE_PSE; }

/*
 * The following only works if pte_present() is not true.
 */
static inline int pte_file(pte_t pte)		{ return (pte).pte_low & _PAGE_FILE; }

static inline pte_t pte_mkclean(pte_t pte)	{ (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
static inline pte_t pte_mkold(pte_t pte)	{ (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
static inline pte_t pte_wrprotect(pte_t pte)	{ (pte).pte_low &= ~_PAGE_RW; return pte; }
static inline pte_t pte_mkdirty(pte_t pte)	{ (pte).pte_low |= _PAGE_DIRTY; return pte; }
static inline pte_t pte_mkyoung(pte_t pte)	{ (pte).pte_low |= _PAGE_ACCESSED; return pte; }
static inline pte_t pte_mkwrite(pte_t pte)	{ (pte).pte_low |= _PAGE_RW; return pte; }
static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return pte; }
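/*
 * Editor's sketch (illustrative, not part of the original header): the
 * pte_mk*()/pte_wrprotect() helpers above transform a pte_t by value and
 * never touch the page tables themselves; callers compose them and then
 * write the result back, e.g.
 *
 *	pte_t pte = *ptep;
 *	pte = pte_mkdirty(pte_mkwrite(pte));
 *	set_pte_at(mm, addr, ptep, pte);
 */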
#ifdef CONFIG_X86_PAE
# include <asm/pgtable-3level.h>
#else
# include <asm/pgtable-2level.h>
#endif

#ifndef CONFIG_PARAVIRT
/*
 * Rules for using pte_update - it must be called after any PTE update which
 * has not been done using the set_pte / clear_pte interfaces. It is used by
 * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
 * updates should either be sets, clears, or set_pte_atomic for P->P
 * transitions, which means this hook should only be called for user PTEs.
 * This hook implies a P->P protection or access change has taken place, which
 * requires a subsequent TLB flush. The notification can optionally be delayed
 * until the TLB flush event by using the pte_update_defer form of the
 * interface, but care must be taken to ensure that the flush happens while
 * still holding the same page table lock so that the shadow and primary pages
 * do not become out of sync on SMP.
 */
#define pte_update(mm, addr, ptep)		do { } while (0)
#define pte_update_defer(mm, addr, ptep)	do { } while (0)
#endif

/* local pte updates need not use xchg for locking */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
{
	pte_t res = *ptep;

	/* Pure native function needs no input for mm, addr */
	native_pte_clear(NULL, 0, ptep);
	return res;
}

/*
 * We only update the dirty/accessed state if we set
 * the dirty bit by hand in the kernel, since the hardware
 * will do the accessed bit for us, and we don't want to
 * race with other CPUs that might be updating the dirty
 * bit at the same time.
 */
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags(vma, address, ptep, entry, dirty)		\
({									\
	int __changed = !pte_same(*(ptep), entry);			\
	if (__changed && dirty) {					\
		(ptep)->pte_low = (entry).pte_low;			\
		pte_update_defer((vma)->vm_mm, (address), (ptep));	\
		flush_tlb_page(vma, address);				\
	}								\
	__changed;							\
})

#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
#define ptep_test_and_clear_young(vma, addr, ptep) ({			\
	int __ret = 0;							\
	if (pte_young(*(ptep)))						\
		__ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,		\
						&(ptep)->pte_low);	\
	if (__ret)							\
		pte_update((vma)->vm_mm, addr, ptep);			\
	__ret;								\
})

#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
#define ptep_clear_flush_young(vma, address, ptep)			\
({									\
	int __young;							\
	__young = ptep_test_and_clear_young((vma), (address), (ptep));	\
	if (__young)							\
		flush_tlb_page(vma, address);				\
	__young;							\
})

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pte_t pte = native_ptep_get_and_clear(ptep);
	pte_update(mm, addr, ptep);
	return pte;
}

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
{
	pte_t pte;
	if (full) {
		/*
		 * Full address destruction in progress; paravirt does not
		 * care about updates and native needs no locking
		 */
		pte = native_local_ptep_get_and_clear(ptep);
	} else {
		pte = ptep_get_and_clear(mm, addr, ptep);
	}
	return pte;
}

#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
	pte_update(mm, addr, ptep);
}

/*
 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
 *
 *  dst - pointer to pgd range anywhere on a pgd page
 *  src - ""
 *  count - the number of pgds to copy.
 *
 * dst and src can be on the same page, but the ranges must not overlap,
 * and must not cross a page boundary.
 */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
	memcpy(dst, src, count * sizeof(pgd_t));
}
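/*
 * Editor's sketch (illustrative, not part of the original header): pgd
 * construction uses clone_pgd_range() to copy the kernel part of
 * swapper_pg_dir into a freshly allocated pgd page, roughly:
 *
 *	clone_pgd_range(pgd + USER_PTRS_PER_PGD,
 *			swapper_pg_dir + USER_PTRS_PER_PGD,
 *			KERNEL_PGD_PTRS);
 *
 * Both ranges lie on one pgd page and do not overlap, satisfying the
 * requirements documented above.
 */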
/*
 * Macro to mark a page protection value as "uncacheable".
 * On processors which do not support it, this is a no-op.
 */
#define pgprot_noncached(prot)	((boot_cpu_data.x86 > 3)				\
				 ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 */

#define mk_pte(page, pgprot)	pfn_pte(page_to_pfn(page), (pgprot))

static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
	pte.pte_low &= _PAGE_CHG_MASK;
	pte.pte_low |= pgprot_val(newprot);
#ifdef CONFIG_X86_PAE
	/*
	 * Chop off the NX bit (if present), and add the NX portion of
	 * the newprot (if present):
	 */
	pte.pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
	pte.pte_high |= (pgprot_val(newprot) >> 32) & \
					(__supported_pte_mask >> 32);
#endif
	return pte;
}

#define pmd_large(pmd) \
	((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))

/*
 * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
 *
 * this macro returns the index of the entry in the pgd page which would
 * control the given virtual address
 */
#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
#define pgd_index_k(addr) pgd_index(addr)

/*
 * pgd_offset() returns a (pgd_t *)
 * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
 */
#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))

/*
 * a shortcut which implies the use of the kernel's pgd, instead
 * of a process's
 */
#define pgd_offset_k(address) pgd_offset(&init_mm, address)

/*
 * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
 *
 * this macro returns the index of the entry in the pmd page which would
 * control the given virtual address
 */
#define pmd_index(address) \
		(((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))

/*
 * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
 *
 * this macro returns the index of the entry in the pte page which would
 * control the given virtual address
 */
#define pte_index(address) \
		(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
#define pte_offset_kernel(dir, address) \
	((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))

#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))

#define pmd_page_vaddr(pmd) \
		((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))

/*
 * Helper function that returns the kernel pagetable entry controlling
 * the virtual address 'address'. NULL means no pagetable entry present.
 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
 * as a pte too.
 */
extern pte_t *lookup_address(unsigned long address);
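/*
 * Editor's sketch (illustrative, not part of the original header): a
 * minimal walk of the kernel page tables for a lowmem address, in the
 * spirit of lookup_address() (large/PSE pmds are not handled here):
 *
 *	pgd_t *pgd = pgd_offset_k(address);
 *	pud_t *pud = pud_offset(pgd, address);
 *	pmd_t *pmd = pmd_offset(pud, address);
 *	pte_t *pte;
 *
 *	if (pmd_none(*pmd) || pmd_large(*pmd))
 *		return NULL;
 *	pte = pte_offset_kernel(pmd, address);
 *
 * pgd_index(), pmd_index() and pte_index() supply the per-level offsets
 * that these helpers apply.
 */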
/*
 * Make a given kernel text page executable/non-executable.
 * Returns the previous executability setting of that page (which
 * is used to restore the previous state). Used by the SMP bootup code.
 * NOTE: this is an __init function for security reasons.
 */
#ifdef CONFIG_X86_PAE
 extern int set_kernel_exec(unsigned long vaddr, int enable);
#else
 static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
#endif

#if defined(CONFIG_HIGHPTE)
#define pte_offset_map(dir, address) \
	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
#define pte_offset_map_nested(dir, address) \
	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
#else
#define pte_offset_map(dir, address) \
	((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
#define pte_unmap(pte) do { } while (0)
#define pte_unmap_nested(pte) do { } while (0)
#endif

/* Clear a kernel PTE and flush it from the TLB */
#define kpte_clear_flush(ptep, vaddr)					\
do {									\
	pte_clear(&init_mm, vaddr, ptep);				\
	__flush_tlb_one(vaddr);						\
} while (0)

/*
 * The i386 doesn't have any external MMU info: the kernel page
 * tables contain all the necessary information.
 */
#define update_mmu_cache(vma,address,pte) do { } while (0)

void native_pagetable_setup_start(pgd_t *base);
void native_pagetable_setup_done(pgd_t *base);

#ifndef CONFIG_PARAVIRT
static inline void paravirt_pagetable_setup_start(pgd_t *base)
{
	native_pagetable_setup_start(base);
}

static inline void paravirt_pagetable_setup_done(pgd_t *base)
{
	native_pagetable_setup_done(base);
}
#endif	/* !CONFIG_PARAVIRT */

#endif /* !__ASSEMBLY__ */

#ifdef CONFIG_FLATMEM
#define kern_addr_valid(addr)		(1)
#endif /* CONFIG_FLATMEM */

#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
		remap_pfn_range(vma, vaddr, pfn, size, prot)

#include <asm-generic/pgtable.h>

#endif /* _I386_PGTABLE_H */