Ядро Linux в комментариях




mm/memory.c


/*
 * linux/mm/memory.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/* demand-loading started 01.12.91 - seems it is high on
 * the list of things wanted, and it should be easy to
 * implement. - Linus */

/* Ok, demand-loading was easy, shared pages a little bit
 * tricker. Shared pages started 02.12.91, seems to
 * work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under
 * the old kernel it would have taken more than the 6M I
 * have free, but it worked well as far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing
 * enough of them. */

/* Real VM (paging to/from disk) started 18.12.91. Much
 * more work and thought has to go into this. Oh, well..
 * 19.12.91 - works, somewhat. Sometimes I get faults,
 * don't know why. Found it. Everything seems to work
 * now.
 * 20.12.91 - Ok, making the swap-device changeable like
 * the root. */
/* 05.04.94 - Multi-pg memory management added for v1.1.
 * Idea by Alex Bligh (alex@cconcepts.co.uk)
 */

#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

unsigned long max_mapnr = 0;
unsigned long num_physpages = 0;
void * high_memory = NULL;

/* We special-case the C-O-W ZERO_PAGE, because it's such
 * a common occurrence (no need to read the page to know
 * that it's zero - better for the cache and memory
 * subsystem). */
static inline void copy_cow_page(unsigned long from,
  unsigned long to)
{
  if (from == ZERO_PAGE) {
    clear_page(to);
    return;
  }
  copy_page(to, from);
}

mem_map_t * mem_map = NULL;

/* oom() prints a message (so that the user knows why the
 * process died), and gives the process an untrappable
 * SIGKILL. */
void oom(struct task_struct * task)
{
  printk("\nOut of memory for %s.\n", task->comm);
  force_sig(SIGKILL, task);
}

/* Note: this doesn't free the actual pages
 * themselves. That has been handled earlier when
 * unmapping all the memory regions. */
static inline void free_one_pmd(pmd_t * dir)
{
  pte_t * pte;

  if (pmd_none(*dir))
    return;
  if (pmd_bad(*dir)) {
    printk("free_one_pmd: bad directory entry %08lx\n",
           pmd_val(*dir));
    pmd_clear(dir);
    return;
  }
  pte = pte_offset(dir, 0);
  pmd_clear(dir);
  pte_free(pte);
}

static inline void free_one_pgd(pgd_t * dir)
{
  int j;
  pmd_t * pmd;

  if (pgd_none(*dir))
    return;
  if (pgd_bad(*dir)) {
    printk("free_one_pgd: bad directory entry %08lx\n",
           pgd_val(*dir));
    pgd_clear(dir);
    return;
  }
  pmd = pmd_offset(dir, 0);
  pgd_clear(dir);
  for (j = 0; j < PTRS_PER_PMD ; j++)
    free_one_pmd(pmd+j);
  pmd_free(pmd);
}

/* Low and high watermarks for page table cache. The
 * system should try to have pgt_water[0] <= cache
 * elements <= pgt_water[1] */
int pgt_cache_water[2] = { 25, 50 };

/* Returns the number of pages freed */
int check_pgt_cache(void)
{
  return do_check_pgt_cache(pgt_cache_water[0],
                            pgt_cache_water[1]);
}

/* This function clears all user-level page tables of a
 * process - this is needed by execve(), so that old
 * pages aren't in the way. */
void clear_page_tables(struct mm_struct *mm,
  unsigned long first, int nr)
{
  pgd_t * page_dir = mm->pgd;

  if (page_dir && page_dir != swapper_pg_dir) {
    page_dir += first;
    do {
      free_one_pgd(page_dir);
      page_dir++;
    } while (--nr);

    /* keep the page table cache within bounds */
    check_pgt_cache();
  }
}

/* This function just free's the page directory - the
 * page tables themselves have been freed earlier by
 * clear_page_tables(). */
void free_page_tables(struct mm_struct * mm)
{
  pgd_t * page_dir = mm->pgd;

  if (page_dir) {
    if (page_dir == swapper_pg_dir)
      goto out_bad;
    pgd_free(page_dir);
  }
  return;

out_bad:
  printk(KERN_ERR
    "free_page_tables: Trying to free kernel pgd\n");
  return;
}

int new_page_tables(struct task_struct * tsk)
{
  pgd_t * new_pg;

  if (!(new_pg = pgd_alloc()))
    return -ENOMEM;
  SET_PAGE_DIR(tsk, new_pg);
  tsk->mm->pgd = new_pg;
  return 0;
}

#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t))
/* copy one vm_area from one task to the other. Assumes
 * the page tables already present in the new task to be
 * cleared in the whole range covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline
 * routines to reduce variable count and make things
 * faster. -jj */
int copy_page_range(struct mm_struct *dst,
  struct mm_struct *src,
  struct vm_area_struct *vma)
{
  pgd_t * src_pgd, * dst_pgd;
  unsigned long address = vma->vm_start;
  unsigned long end = vma->vm_end;
  unsigned long cow =
    (vma->vm_flags & (VM_SHARED | VM_MAYWRITE))
    == VM_MAYWRITE;

  src_pgd = pgd_offset(src, address)-1;
  dst_pgd = pgd_offset(dst, address)-1;

  for (;;) {
    pmd_t * src_pmd, * dst_pmd;

    src_pgd++; dst_pgd++;

    /* copy_pmd_range */

    if (pgd_none(*src_pgd))
      goto skip_copy_pmd_range;
    if (pgd_bad(*src_pgd)) {
      printk("copy_pmd_range: bad pgd (%08lx)\n",
             pgd_val(*src_pgd));
      pgd_clear(src_pgd);
skip_copy_pmd_range:
      address = (address + PGDIR_SIZE) & PGDIR_MASK;
      if (address >= end)
        goto out;
      continue;
    }
    if (pgd_none(*dst_pgd)) {
      if (!pmd_alloc(dst_pgd, 0))
        goto nomem;
    }

    src_pmd = pmd_offset(src_pgd, address);
    dst_pmd = pmd_offset(dst_pgd, address);

    do {
      pte_t * src_pte, * dst_pte;

      /* copy_pte_range */

      if (pmd_none(*src_pmd))
        goto skip_copy_pte_range;
      if (pmd_bad(*src_pmd)) {
        printk("copy_pte_range: bad pmd (%08lx)\n",
               pmd_val(*src_pmd));
        pmd_clear(src_pmd);
skip_copy_pte_range:
        address = (address + PMD_SIZE) & PMD_MASK;
        if (address >= end)
          goto out;
        goto cont_copy_pmd_range;
      }
      if (pmd_none(*dst_pmd)) {
        if (!pte_alloc(dst_pmd, 0))
          goto nomem;
      }

      src_pte = pte_offset(src_pmd, address);
      dst_pte = pte_offset(dst_pmd, address);

      do {
        pte_t pte = *src_pte;
        unsigned long page_nr;

        /* copy_one_pte */

        if (pte_none(pte))
          goto cont_copy_pte_range;
        if (!pte_present(pte)) {
          swap_duplicate(pte_val(pte));
          set_pte(dst_pte, pte);
          goto cont_copy_pte_range;
        }
        page_nr = MAP_NR(pte_page(pte));
        if (page_nr >= max_mapnr ||
            PageReserved(mem_map+page_nr)) {
          set_pte(dst_pte, pte);
          goto cont_copy_pte_range;
        }
        /* If it's a COW mapping, write protect it both
         * in the parent and the child */
        if (cow) {
          pte = pte_wrprotect(pte);
          set_pte(src_pte, pte);
        }
        /* If it's a shared mapping, mark it clean in the
         * child */
        if (vma->vm_flags & VM_SHARED)
          pte = pte_mkclean(pte);
        set_pte(dst_pte, pte_mkold(pte));
        atomic_inc(&mem_map[page_nr].count);

cont_copy_pte_range:
        address += PAGE_SIZE;
        if (address >= end)
          goto out;
        src_pte++;
        dst_pte++;
      } while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:
      src_pmd++;
      dst_pmd++;
    } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
  }
out:
  return 0;

nomem:
  return -ENOMEM;
}
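The cow flag at the top of copy_page_range() is set only for private writable mappings: VM_MAYWRITE present, VM_SHARED absent. For those, the routine write-protects the PTE in both parent and child, so the first write by either task faults and receives a private copy, while shared mappings keep pointing at the same page. Below is a minimal user-space sketch of the resulting fork() semantics, using only standard POSIX calls; it is an editorial illustration, not part of the kernel listing.

/* Editorial illustration (not part of the kernel listing): user-space
 * view of the copy-on-write semantics that copy_page_range() sets up
 * at fork() for private mappings. */
#define _DEFAULT_SOURCE           /* for MAP_ANONYMOUS on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static void try_flags(int flags, const char *name)
{
  char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                 flags | MAP_ANONYMOUS, -1, 0);
  pid_t pid;

  if (p == MAP_FAILED) {
    perror("mmap");
    exit(1);
  }
  strcpy(p, "before fork");
  pid = fork();
  if (pid < 0) {
    perror("fork");
    exit(1);
  }
  if (pid == 0) {               /* child writes to the page */
    strcpy(p, "after fork");
    _exit(0);
  }
  wait(NULL);
  /* MAP_PRIVATE: the child's write went to a private copy, the parent
   * still sees "before fork".  MAP_SHARED: both tasks map the same
   * page, so the parent sees "after fork". */
  printf("%-11s parent sees: \"%s\"\n", name, p);
  munmap(p, 4096);
}

int main(void)
{
  try_flags(MAP_PRIVATE, "MAP_PRIVATE");
  try_flags(MAP_SHARED, "MAP_SHARED");
  return 0;
}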
/* Return indicates whether a page was freed so caller
 * can adjust rss */
static inline int free_pte(pte_t page)
{
  if (pte_present(page)) {
    unsigned long addr = pte_page(page);
    if (MAP_NR(addr) >= max_mapnr ||
        PageReserved(mem_map+MAP_NR(addr)))
      return 0;
    /* free_page() used to be able to clear swap cache
     * entries. We may now have to do it manually. */
    free_page_and_swap_cache(addr);
    return 1;
  }
  swap_free(pte_val(page));
  return 0;
}

static inline void forget_pte(pte_t page)
{
  if (!pte_none(page)) {
    printk("forget_pte: old mapping existed!\n");
    free_pte(page);
  }
}

static inline int zap_pte_range(pmd_t * pmd,
  unsigned long address, unsigned long size)
{
  pte_t * pte;
  int freed;

  if (pmd_none(*pmd))
    return 0;
  if (pmd_bad(*pmd)) {
    printk("zap_pte_range: bad pmd (%08lx)\n",
           pmd_val(*pmd));
    pmd_clear(pmd);
    return 0;
  }
  pte = pte_offset(pmd, address);
  address &= ~PMD_MASK;
  if (address + size > PMD_SIZE)
    size = PMD_SIZE - address;
  size >>= PAGE_SHIFT;
  freed = 0;
  for (;;) {
    pte_t page;
    if (!size)
      break;
    page = *pte;
    pte++;
    size--;
    if (pte_none(page))
      continue;
    pte_clear(pte-1);
    freed += free_pte(page);
  }
  return freed;
}

static inline int zap_pmd_range(pgd_t * dir,
  unsigned long address, unsigned long size)
{
  pmd_t * pmd;
  unsigned long end;
  int freed;

  if (pgd_none(*dir))
    return 0;
  if (pgd_bad(*dir)) {
    printk("zap_pmd_range: bad pgd (%08lx)\n",
           pgd_val(*dir));
    pgd_clear(dir);
    return 0;
  }
  pmd = pmd_offset(dir, address);
  address &= ~PGDIR_MASK;
  end = address + size;
  if (end > PGDIR_SIZE)
    end = PGDIR_SIZE;
  freed = 0;
  do {
    freed += zap_pte_range(pmd, address, end - address);
    address = (address + PMD_SIZE) & PMD_MASK;
    pmd++;
  } while (address < end);
  return freed;
}

/* remove user pages in a given range. */
void zap_page_range(struct mm_struct *mm,
  unsigned long address, unsigned long size)
{
  pgd_t * dir;
  unsigned long end = address + size;
  int freed = 0;

  dir = pgd_offset(mm, address);
  while (address < end) {
    freed += zap_pmd_range(dir, address, end - address);
    address = (address + PGDIR_SIZE) & PGDIR_MASK;
    dir++;
  }
  /* Update rss for the mm_struct (not necessarily
   * current->mm) */
  if (mm->rss > 0) {
    mm->rss -= freed;
    if (mm->rss < 0)
      mm->rss = 0;
  }
}
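zap_page_range(), zap_pmd_range() and zap_pte_range() show the traversal pattern used throughout this file: the pgd entry selects a pmd table, the pmd entry selects a pte table, and the pte maps a single page, with each level clamped to its PGDIR_SIZE or PMD_SIZE boundary. The sketch below, an editorial aside rather than kernel code, shows the index arithmetic that pgd_offset() and pte_offset() hide; the constants assume classic i386 without PAE, where PTRS_PER_PMD == 1 and the pmd level is folded into the pgd.

/* Editorial illustration (not kernel code): how a 32-bit virtual
 * address breaks down into the indices that pgd_offset(),
 * pmd_offset() and pte_offset() compute.  Constants assume classic
 * i386 without PAE. */
#include <stdio.h>

#define PAGE_SHIFT   12                    /* 4 KB pages */
#define PGDIR_SHIFT  22                    /* 10-bit pgd index */
#define PTRS_PER_PTE 1024UL

int main(void)
{
  unsigned long addr = 0x08049f04UL;       /* arbitrary user address */

  unsigned long pgd_index = addr >> PGDIR_SHIFT;
  unsigned long pte_index = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
  unsigned long offset    = addr & ((1UL << PAGE_SHIFT) - 1);

  printf("0x%08lx -> pgd[%lu], pte[%lu], byte offset 0x%03lx\n",
         addr, pgd_index, pte_index, offset);
  return 0;
}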
static inline void zeromap_pte_range(pte_t * pte,
  unsigned long address, unsigned long size,
  pte_t zero_pte)
{
  unsigned long end;

  address &= ~PMD_MASK;
  end = address + size;
  if (end > PMD_SIZE)
    end = PMD_SIZE;
  do {
    pte_t oldpage = *pte;
    set_pte(pte, zero_pte);
    forget_pte(oldpage);
    address += PAGE_SIZE;
    pte++;
  } while (address < end);
}

static inline int zeromap_pmd_range(pmd_t * pmd,
  unsigned long address, unsigned long size,
  pte_t zero_pte)
{
  unsigned long end;

  address &= ~PGDIR_MASK;
  end = address + size;
  if (end > PGDIR_SIZE)
    end = PGDIR_SIZE;
  do {
    pte_t * pte = pte_alloc(pmd, address);
    if (!pte)
      return -ENOMEM;
    zeromap_pte_range(pte, address, end - address,
                      zero_pte);
    address = (address + PMD_SIZE) & PMD_MASK;
    pmd++;
  } while (address < end);
  return 0;
}

int zeromap_page_range(unsigned long address,
  unsigned long size, pgprot_t prot)
{
  int error = 0;
  pgd_t * dir;
  unsigned long beg = address;
  unsigned long end = address + size;
  pte_t zero_pte;

  zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
  dir = pgd_offset(current->mm, address);
  flush_cache_range(current->mm, beg, end);
  while (address < end) {
    pmd_t *pmd = pmd_alloc(dir, address);
    error = -ENOMEM;
    if (!pmd)
      break;
    error = zeromap_pmd_range(pmd, address,
                              end - address, zero_pte);
    if (error)
      break;
    address = (address + PGDIR_SIZE) & PGDIR_MASK;
    dir++;
  }
  flush_tlb_range(current->mm, beg, end);
  return error;
}

/* maps a range of physical memory into the requested
 * pages. the old mappings are removed. any references to
 * nonexistent pages results in null mappings (currently
 * treated as "copy-on-access") */
static inline void remap_pte_range(pte_t * pte,
  unsigned long address, unsigned long size,
  unsigned long phys_addr, pgprot_t prot)
{
  unsigned long end;

  address &= ~PMD_MASK;
  end = address + size;
  if (end > PMD_SIZE)
    end = PMD_SIZE;
  do {
    unsigned long mapnr;
    pte_t oldpage = *pte;
    pte_clear(pte);

    mapnr = MAP_NR(__va(phys_addr));
    if (mapnr >= max_mapnr ||
        PageReserved(mem_map+mapnr))
      set_pte(pte, mk_pte_phys(phys_addr, prot));
    forget_pte(oldpage);
    address += PAGE_SIZE;
    phys_addr += PAGE_SIZE;
    pte++;
  } while (address < end);
}

static inline int remap_pmd_range(pmd_t * pmd,
  unsigned long address, unsigned long size,
  unsigned long phys_addr, pgprot_t prot)
{
  unsigned long end;

  address &= ~PGDIR_MASK;
  end = address + size;
  if (end > PGDIR_SIZE)
    end = PGDIR_SIZE;
  phys_addr -= address;
  do {
    pte_t * pte = pte_alloc(pmd, address);
    if (!pte)
      return -ENOMEM;
    remap_pte_range(pte, address, end - address,
                    address + phys_addr, prot);
    address = (address + PMD_SIZE) & PMD_MASK;
    pmd++;
  } while (address < end);
  return 0;
}

int remap_page_range(unsigned long from,
  unsigned long phys_addr, unsigned long size,
  pgprot_t prot)
{
  int error = 0;
  pgd_t * dir;
  unsigned long beg = from;
  unsigned long end = from + size;

  phys_addr -= from;
  dir = pgd_offset(current->mm, from);
  flush_cache_range(current->mm, beg, end);
  while (from < end) {
    pmd_t *pmd = pmd_alloc(dir, from);
    error = -ENOMEM;
    if (!pmd)
      break;
    error = remap_pmd_range(pmd, from, end - from,
                            phys_addr + from, prot);
    if (error)
      break;
    from = (from + PGDIR_SIZE) & PGDIR_MASK;
    dir++;
  }
  flush_tlb_range(current->mm, beg, end);
  return error;
}
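remap_page_range() is what a driver of this era calls from its mmap file operation to map device memory into user space; note that remap_pte_range() above installs a PTE only for reserved or out-of-range page frames, which is exactly what device memory looks like to the page allocator. Below is a hedged sketch of such an mmap method: the device window MYDEV_PHYS/MYDEV_SIZE is hypothetical, the surrounding driver boilerplate is omitted, and it is meant to be read against a 2.2-era tree rather than compiled stand-alone.

/* Hypothetical fragment of a 2.2-era character driver (editorial
 * sketch, not from the kernel source): map the device's physical
 * window into the calling process with remap_page_range(). */
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/mm.h>

#define MYDEV_PHYS 0xd0000000UL   /* assumed device base address */
#define MYDEV_SIZE 0x00100000UL   /* assumed window size: 1 MB */

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
  unsigned long size = vma->vm_end - vma->vm_start;

  /* refuse mappings that run past the device window */
  if (vma->vm_offset + size > MYDEV_SIZE)
    return -EINVAL;
  /* install the PTEs; remap_page_range() fails with -ENOMEM */
  if (remap_page_range(vma->vm_start, MYDEV_PHYS + vma->vm_offset,
                       size, vma->vm_page_prot))
    return -EAGAIN;
  return 0;
}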
/* sanity-check function.. */
static void put_page(pte_t * page_table, pte_t pte)
{
  if (!pte_none(*page_table)) {
    free_page_and_swap_cache(pte_page(pte));
    return;
  }
  /* no need for flush_tlb */
  set_pte(page_table, pte);
}

/* This routine is used to map in a page into an address
 * space: needed by execve() for the initial stack and
 * environment pages. */
unsigned long put_dirty_page(struct task_struct * tsk,
  unsigned long page, unsigned long address)
{
  pgd_t * pgd;
  pmd_t * pmd;
  pte_t * pte;

  if (MAP_NR(page) >= max_mapnr)
    printk("put_dirty_page: trying to put page %08lx at "
           "%08lx\n",page,address);
  if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
    printk("mem_map disagrees with %08lx at %08lx\n",
           page,address);
  pgd = pgd_offset(tsk->mm,address);
  pmd = pmd_alloc(pgd, address);
  if (!pmd) {
    free_page(page);
    oom(tsk);
    return 0;
  }
  pte = pte_alloc(pmd, address);
  if (!pte) {
    free_page(page);
    oom(tsk);
    return 0;
  }
  if (!pte_none(*pte)) {
    printk("put_dirty_page: page already exists\n");
    free_page(page);
    return 0;
  }
  flush_page_to_ram(page);
  set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page,
    PAGE_COPY))));
  /* no need for flush_tlb */
  return page;
}

/* This routine handles present pages, when users try to
 * write to a shared page. It is done by copying the page
 * to a new address and decrementing the shared-page
 * counter for the old page.
 *
 * Goto-purists beware: the only reason for goto's here
 * is that it results in better assembly code.. The
 * "default" path will see no jumps at all.
 *
 * Note that this routine assumes that the protection
 * checks have been done by the caller (the low-level
 * page fault routine in most cases). Thus we can safely
 * just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though
 * the page will change only once the write actually
 * happens. This avoids a few races, and potentially
 * makes it more efficient. */



