前面已經分析了內核頁表的準備工作以及內核低端內存頁表的建立,接著回到init_mem_mapping()中,低端內存頁表建立後緊隨著還有一個函數early_ioremap_page_table_range_init():
【file:/arch/x86/mm/init.c】
/*
* Build a proper pagetable for the kernel mappings. Up until this
* point, we've been running on some set of pagetables constructed by
* the boot process.
*
* If we're booting on native hardware, this will be a pagetable
* constructed in arch/x86/kernel/head_32.S. The root of the
* pagetable will be swapper_pg_dir.
*
* If we're booting paravirtualized under a hypervisor, then there are
* more options: we may already be running PAE, and the pagetable may
* or may not be based in swapper_pg_dir. In any case,
* paravirt_pagetable_init() will set up swapper_pg_dir
* appropriately for the rest of the initialization to work.
*
* In general, pagetable_init() assumes that the pagetable may already
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
void __init early_ioremap_page_table_range_init(void)
{
pgd_t *pgd_base = swapper_pg_dir;
unsigned long vaddr, end;
/*
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
*/
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
early_ioremap_reset();
}
該函數主要是用於建立固定內存映射區的。固定內存映射區是指FIXADDR_START到FIXADDR_TOP的地址空間,而該地址空間因功能特性不同通過索引來定義區分,其中索引以枚舉類型的形式定義在enum fixed_addresses裡面。
【file:/arch/x86/include/asm/fixmap.h】
/*
* Here we define all the compile-time 'special' virtual
* addresses. The point is to have a constant address at
* compile time, but to set the physical address only
* in the boot process.
* for x86_32: We allocate these special addresses
* from the end of virtual memory (0xfffff000) backwards.
* Also this lets us do fail-safe vmalloc(), we
* can guarantee that these special addresses and
* vmalloc()-ed addresses never overlap.
*
* These 'compile-time allocated' memory buffers are
* fixed-size 4k pages (or larger if used with an increment
* higher than 1). Use set_fixmap(idx,phys) to associate
* physical memory with fixmap indices.
*
* TLB entries of such buffers will not be flushed across
* task switches.
*/
enum fixed_addresses {
#ifdef CONFIG_X86_32
FIX_HOLE,
FIX_VDSO,
#else
VSYSCALL_LAST_PAGE,
VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
VVAR_PAGE,
VSYSCALL_HPET,
#ifdef CONFIG_PARAVIRT_CLOCK
PVCLOCK_FIXMAP_BEGIN,
PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
#endif
#endif
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
FIX_OHCI1394_BASE,
#endif
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
#ifdef CONFIG_X86_IO_APIC
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
#ifdef CONFIG_X86_VISWS_APIC
FIX_CO_CPU, /* Cobalt timer */
FIX_CO_APIC, /* Cobalt APIC Redirection Table */
FIX_LI_PCIA, /* Lithium PCI Bridge A */
FIX_LI_PCIB, /* Lithium PCI Bridge B */
#endif
FIX_RO_IDT, /* Virtual mapping for read-only IDT */
#ifdef CONFIG_X86_32
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
#ifdef CONFIG_PCI_MMCONFIG
FIX_PCIE_MCFG,
#endif
#endif
#ifdef CONFIG_PARAVIRT
FIX_PARAVIRT_BOOTMAP,
#endif
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
#ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC,
#endif
__end_of_permanent_fixed_addresses,
/*
* 256 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
*
* If necessary we round it up to the next 256 pages boundary so
* that we can have a single pgd entry and a single pte table:
*/
#define NR_FIX_BTMAPS 64
#define FIX_BTMAPS_SLOTS 4
#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
FIX_BTMAP_END =
(__end_of_permanent_fixed_addresses ^
(__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
-PTRS_PER_PTE
? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
(__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
: __end_of_permanent_fixed_addresses,
FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
#ifdef CONFIG_X86_32
FIX_WP_TEST,
#endif
#ifdef CONFIG_INTEL_TXT
FIX_TBOOT_BASE,
#endif
__end_of_fixed_addresses
};
但是各枚舉標識的分區並不是從低地址往高地址分布,而是自高地址往低地址分布。其中__fix_to_virt宏定義就是用來通過索引來計算相應的固定映射區域的線性地址。
#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
對應的有虛擬地址轉索引的宏:
#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
接著回到early_ioremap_page_table_range_init()的第一個函數page_table_range_init():
【file:/arch/x86/mm/init_32.c】
/*
* This function initializes a certain range of kernel virtual memory
* with new bootmem page tables, everywhere page tables are missing in
* the given range.
*
* NOTE: The pagetables are allocated contiguous on the physical space
* so we can cache the place of the first one and move around without
* checking the pgd every time.
*/
static void __init
page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
{
int pgd_idx, pmd_idx;
unsigned long vaddr;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte = NULL;
unsigned long count = page_table_range_init_count(start, end);
void *adr = NULL;
if (count)
adr = alloc_low_pages(count);
vaddr = start;
pgd_idx = pgd_index(vaddr);
pmd_idx = pmd_index(vaddr);
pgd = pgd_base + pgd_idx;
for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
pmd = one_md_table_init(pgd);
pmd = pmd + pmd_index(vaddr);
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
pte = page_table_kmap_check(one_page_table_init(pmd),
pmd, vaddr, pte, &adr);
vaddr += PMD_SIZE;
}
pmd_idx = 0;
}
}
page_table_range_init_count()用來計算指臨時內核映射區間的頁表數量。前面提到FIXADDR_START到FIXADDR_TOP是固定映射區,其間有多個索引標識不同功能的映射區間,其中的一個區間FIX_KMAP_BEGIN到FIX_KMAP_END是臨時內核映射區。順便可以看一下兩者的定義:
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
其中KM_TYPE_NR表示「窗口」數量,在高端內存的任意一個頁框都可以通過一個「窗口」映射到內核地址空間,調用kmap_atomic可以搭建起「窗口」到高端內存的關係,即建立臨時內核映射。而NR_CPUS則表示CPU數量。總的來說就是該臨時內核映射區間是為了給各個CPU準備一個指定的窗口空間。由於kmap_atomic()對該區間的使用,所以該區間必須保證其頁表連續性。
如果頁全局目錄數不為0的時候,緊接著page_table_range_init_count()的是alloc_low_pages():
【file:/arch/x86/mm/init.c】
/*
* Pages returned are already directly mapped.
*
* Changing that is likely to break Xen, see commit:
*
* 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
*
* for detailed information.
*/
__ref void *alloc_low_pages(unsigned int num)
{
unsigned long pfn;
int i;
if (after_bootmem) {
unsigned int order;
order = get_order((unsigned long)num << PAGE_SHIFT);
return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
__GFP_ZERO, order);
}
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
unsigned long ret;
if (min_pfn_mapped >= max_pfn_mapped)
panic("alloc_low_pages: ran out of memory");
ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
if (!ret)
panic("alloc_low_pages: can not alloc memory");
memblock_reserve(ret, PAGE_SIZE * num);
pfn = ret >> PAGE_SHIFT;
} else {
pfn = pgt_buf_end;
pgt_buf_end += num;
printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\\n",
pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
}
for (i = 0; i < num; i++) {
void *adr;
adr = __va((pfn + i) << PAGE_SHIFT);
clear_page(adr);
}
return __va(pfn << PAGE_SHIFT);
}
則是根據前面early_alloc_pgt_buf()申請保留的頁表緩衝空間使用情況來判斷,是從頁表緩衝空間中申請還是通過memblock算法申請頁表內存。
回到page_table_range_init(),其中one_md_table_init()是用於當pgd入參為空時,申請新物理頁作為頁中間目錄的,但是此次僅分析x86非PAE環境的情況,不存在頁中間目錄,故實際上返回的仍是入參。附代碼:
【file:/arch/x86/mm/init_32.c】
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
* in non-PAE compilation mode, since the middle layer is folded.
*/
static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
pud_t *pud;
pmd_t *pmd_table;
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
return pmd_table;
}
#endif
pud = pud_offset(pgd, 0);
pmd_table = pmd_offset(pud, 0);
return pmd_table;
}
接著的是page_table_kmap_check(),其入參調用的one_page_table_init()是用於當入參pmd沒有頁表指向時,創建頁表並使其指向被創建的頁表。page_table_kmap_check()實現:
【file:/arch/x86/mm/init_32.c】
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
unsigned long vaddr, pte_t *lastpte,
void **adr)
{
#ifdef CONFIG_HIGHMEM
/*
* Something (early fixmap) may already have put a pte
* page here, which causes the page table allocation
* to become nonlinear. Attempt to fix it, and if it
* is still nonlinear then we have to bug.
*/
int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
&& (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
pte_t *newpte;
int i;
BUG_ON(after_bootmem);
newpte = *adr;
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
*adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
BUG_ON(newpte != pte_offset_kernel(pmd, 0));
__flush_tlb_all();
paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
pte = newpte;
}
BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
&& vaddr > fix_to_virt(FIX_KMAP_END)
&& lastpte && lastpte + PTRS_PER_PTE != pte);
#endif
return pte;
}
可以看到這裡在此出現臨時內核映射區間的標識(FIX_KMAP_END和FIX_KMAP_BEGIN),檢查當前頁表初始化的地址是否處於該區間範圍,如果是,則把其pte頁表的內容拷貝到page_table_range_init()申請的頁表空間中,並將newpte新頁表的地址設置到pmd中(32bit系統實際上就是頁全局目錄),然後調用__flush_tlb_all()刷新TLB緩存;如果不是該區間,則僅是由入參中調用的one_page_table_init()被分配到了頁表空間。
由此,可以知道page_table_range_init()主要是做了什麼了。這是由於kmap_atomic()對該區間的使用,該區間必須保證其頁表連續性。為了避免前期可能對固定映射區已經分配了頁表項,基於臨時內核映射區間要求頁表連續性的保證,所以在此重新申請連續的頁表空間將原頁表內容拷貝至此。值得注意的是,與低端內存的頁表初始化不同的是,這裡的頁表只是被分配,相應的PTE項並未初始化,這個工作將會交由以後各個固定映射區部分的相關代碼調用set_fixmap()來將相關的固定映射區頁表與物理內存關聯。
early_ioremap_page_table_range_init()函數再往下的early_ioremap_reset()僅是對after_paging_init全局變量賦值。
最後退出early_ioremap_page_table_range_init()後,init_mem_mapping()調用load_cr3()刷新CR3寄存器,__flush_tlb_all()則用於刷新TLB,由此啟用新的內存分頁映射。
至此,內核頁表建立完畢。