[PATCH] mincore for i386, against 2.3.40-3

From: Chuck Lever (cel@monkey.org)
Date: Thu Jan 20 2000 - 14:25:29 EST

  • Next message: Davide Libenzi: "Re: Interesting analysis of linux kernel threading by IBM"

    hi all,

    here's the mincore implementation again, with some improvements thanks to
    Jeff Garzik and Chris Wing.

    - mincore no longer uses vmalloc
    - mincore can scan through arbitrarily large vm areas using only a fixed
    amount of kernel memory
    - vm_operations_struct instantiations now use labeled initializers, which
    name only the structure fields needed; the rest are initialized to zero
    - mincore_area loop rewritten to hoist the "if" out of the inner loop
    - default incore function added, which simply checks pte_present

    comments welcome.

    diff -ruN Linux-2.3.40-3/arch/i386/kernel/entry.S linux/arch/i386/kernel/entry.S
    --- Linux-2.3.40-3/arch/i386/kernel/entry.S Tue Jan 11 12:40:43 2000
    +++ linux/arch/i386/kernel/entry.S Wed Jan 19 18:14:46 2000
    @@ -617,6 +617,7 @@
             .long SYMBOL_NAME(sys_setgid)
             .long SYMBOL_NAME(sys_setfsuid) /* 215 */
             .long SYMBOL_NAME(sys_setfsgid)
    + .long SYMBOL_NAME(sys_mincore)
     
     
             /*
    @@ -625,6 +626,6 @@
              * entries. Don't panic if you notice that this hasn't
              * been shrunk every time we add a new system call.
              */
    - .rept NR_syscalls-216
    + .rept NR_syscalls-217
                     .long SYMBOL_NAME(sys_ni_syscall)
             .endr
    diff -ruN Linux-2.3.40-3/drivers/sgi/char/graphics.c linux/drivers/sgi/char/graphics.c
    --- Linux-2.3.40-3/drivers/sgi/char/graphics.c Tue Jan 11 12:33:01 2000
    +++ linux/drivers/sgi/char/graphics.c Wed Jan 19 17:41:48 2000
    @@ -254,15 +254,7 @@
      */
     
     static struct vm_operations_struct graphics_mmap = {
    - NULL, /* no special mmap-open */
    - NULL, /* no special mmap-close */
    - NULL, /* no special mmap-unmap */
    - NULL, /* no special mmap-protect */
    - NULL, /* no special mmap-sync */
    - NULL, /* no special mmap-advise */
    - sgi_graphics_nopage, /* our magic no-page fault handler */
    - NULL, /* no special mmap-wppage */
    - NULL /* no special mmap-swapout */
    + nopage: sgi_graphics_nopage, /* our magic no-page fault handler */
     };
             
     int
    diff -ruN Linux-2.3.40-3/drivers/sgi/char/shmiq.c linux/drivers/sgi/char/shmiq.c
    --- Linux-2.3.40-3/drivers/sgi/char/shmiq.c Tue Jan 11 12:33:01 2000
    +++ linux/drivers/sgi/char/shmiq.c Wed Jan 19 17:42:12 2000
    @@ -296,15 +296,7 @@
     }
     
     static struct vm_operations_struct qcntl_mmap = {
    - NULL, /* no special mmap-open */
    - NULL, /* no special mmap-close */
    - NULL, /* no special mmap-unmap */
    - NULL, /* no special mmap-protect */
    - NULL, /* no special mmap-sync */
    - NULL, /* no special mmap-advise */
    - shmiq_nopage, /* our magic no-page fault handler */
    - NULL, /* no special mmap-wppage */
    - NULL /* no special mmap-swapout */
    + nopage: shmiq_nopage, /* our magic no-page fault handler */
     };
     
     static int
    diff -ruN Linux-2.3.40-3/fs/ncpfs/mmap.c linux/fs/ncpfs/mmap.c
    --- Linux-2.3.40-3/fs/ncpfs/mmap.c Wed Nov 24 16:47:44 1999
    +++ linux/fs/ncpfs/mmap.c Wed Jan 19 17:42:47 2000
    @@ -94,15 +94,7 @@
     
     struct vm_operations_struct ncp_file_mmap =
     {
    - NULL, /* open */
    - NULL, /* close */
    - NULL, /* unmap */
    - NULL, /* protect */
    - NULL, /* sync */
    - NULL, /* advise */
    - ncp_file_mmap_nopage, /* nopage */
    - NULL, /* wppage */
    - NULL /* swapout */
    + nopage: ncp_file_mmap_nopage,
     };
     
     
    diff -ruN Linux-2.3.40-3/include/asm-i386/unistd.h linux/include/asm-i386/unistd.h
    --- Linux-2.3.40-3/include/asm-i386/unistd.h Tue Jan 11 12:40:56 2000
    +++ linux/include/asm-i386/unistd.h Tue Jan 18 15:25:43 2000
    @@ -221,6 +221,7 @@
     #define __NR_setgid32 214
     #define __NR_setfsuid32 215
     #define __NR_setfsgid32 216
    +#define __NR_mincore 217
     
     /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
     
    diff -ruN Linux-2.3.40-3/include/linux/mm.h linux/include/linux/mm.h
    --- Linux-2.3.40-3/include/linux/mm.h Fri Jan 14 15:43:09 2000
    +++ linux/include/linux/mm.h Wed Jan 19 17:26:47 2000
    @@ -106,6 +106,7 @@
             void (*protect)(struct vm_area_struct *area, unsigned long, size_t, unsigned int newprot);
             int (*sync)(struct vm_area_struct *area, unsigned long, size_t, unsigned int flags);
             void (*advise)(struct vm_area_struct *area, unsigned long, size_t, unsigned int advise);
    + char (*incore)(struct vm_area_struct *area, unsigned long);
             struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int write_access);
             struct page * (*wppage)(struct vm_area_struct * area, unsigned long address, struct page * page);
             int (*swapout)(struct page *, struct file *);
    @@ -383,6 +384,7 @@
     extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot);
     extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot);
     
    +extern char default_incore(struct vm_area_struct * vma, unsigned long address);
     extern void vmtruncate(struct inode * inode, loff_t offset);
     extern int handle_mm_fault(struct task_struct *tsk,struct vm_area_struct *vma, unsigned long address, int write_access);
     extern int make_pages_present(unsigned long addr, unsigned long end);
    @@ -446,6 +448,8 @@
                             size_t size, unsigned int flags);
     extern struct page *filemap_nopage(struct vm_area_struct * area,
                                         unsigned long address, int no_share);
    +extern char filemap_incore(struct vm_area_struct * vma,
    + unsigned long pgoff);
     
     /*
      * GFP bitmasks..
    diff -ruN Linux-2.3.40-3/ipc/shm.c linux/ipc/shm.c
    --- Linux-2.3.40-3/ipc/shm.c Fri Jan 14 15:43:09 2000
    +++ linux/ipc/shm.c Wed Jan 19 17:42:27 2000
    @@ -64,6 +64,7 @@
     static void killseg (int shmid);
     static void shm_open (struct vm_area_struct *shmd);
     static void shm_close (struct vm_area_struct *shmd);
    +static char shm_incore (struct vm_area_struct *shmd, unsigned long pgoff);
     static struct page * shm_nopage(struct vm_area_struct *, unsigned long, int);
     static int shm_swapout(struct page *, struct file *);
     #ifdef CONFIG_PROC_FS
    @@ -584,15 +585,11 @@
      */
     
     static struct vm_operations_struct shm_vm_ops = {
    - shm_open, /* open - callback for a new vm-area open */
    - shm_close, /* close - callback for when the vm-area is released */
    - NULL, /* no need to sync pages at unmap */
    - NULL, /* protect */
    - NULL, /* sync */
    - NULL, /* advise */
    - shm_nopage, /* nopage */
    - NULL, /* wppage */
    - shm_swapout /* swapout */
    + open: shm_open,
    + close: shm_close,
    + incore: shm_incore,
    + nopage: shm_nopage,
    + swapout: shm_swapout,
     };
     
     /* Insert shmd into the list shp->attaches */
    @@ -829,6 +826,34 @@
     static int shm_swapout(struct page * page, struct file *file)
     {
             return 0;
    +}
    +
    +/*
    + * incore vm operation for shm: 1 if SHM_ENTRY(shp, pgoff) is present, else 0.
    + */
    +static char shm_incore(struct vm_area_struct * shmd, unsigned long pgoff)
    +{
    + char result = 0;
    + pte_t pte;
    + struct shmid_kernel * shp = (struct shmid_kernel *) shmd->vm_private_data;
    +
    + down(&shp->sem);
    + if(shp != shm_lock(shp->id))
    + BUG();
    +
    + pte = SHM_ENTRY(shp, pgoff); /* NOTE(review): pgoff is not range-checked here -- confirm callers */
    +
    + /*
    + * the pte isn't present if the page is swapped, or if it hasn't
    + * been touched yet.
    + */
    + if (pte_present(pte))
    + result = 1;
    +
    + shm_unlock(shp->id);
    + up(&shp->sem);
    +
    + return result;
     }
     
     /*
    diff -ruN Linux-2.3.40-3/mm/filemap.c linux/mm/filemap.c
    --- Linux-2.3.40-3/mm/filemap.c Fri Jan 14 15:43:09 2000
    +++ linux/mm/filemap.c Wed Jan 19 17:42:41 2000
    @@ -700,6 +700,30 @@
             return page;
     }
     
    +/*
    + * "incore" vm operation for file mappings: returns 1 if page "pgoff"
    + * of the file mapped by vma is "in core," otherwise 0.
    + *
    + * Later we can get more picky about what "in core" means precisely,
    + * but for now it means the page is in the page cache and is up to
    + * date, i.e. no page-in operation would be required at this time if
    + * an application were to map and access this page.
    + */
    +char filemap_incore(struct vm_area_struct * vma, unsigned long pgoff)
    +{
    + char result = 0;
    + struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
    + struct page * page, ** hash = page_hash(as, pgoff);
    +
    + spin_lock(&pagecache_lock);
    + page = __find_page_nolock(as, pgoff, *hash);
    + if ((page) && (Page_Uptodate(page)))
    + result = 1;
    + spin_unlock(&pagecache_lock);
    +
    + return result;
    +}
    +
     #if 0
     #define PROFILE_READAHEAD
     #define DEBUG_READAHEAD
    @@ -1627,15 +1651,11 @@
      * backing-store for swapping..
      */
     static struct vm_operations_struct file_shared_mmap = {
    - NULL, /* no special open */
    - NULL, /* no special close */
    - filemap_unmap, /* unmap - we need to sync the pages */
    - NULL, /* no special protect */
    - filemap_sync, /* sync */
    - NULL, /* advise */
    - filemap_nopage, /* nopage */
    - NULL, /* wppage */
    - filemap_swapout /* swapout */
    + unmap: filemap_unmap, /* unmap - we need to sync the pages */
    + sync: filemap_sync,
    + incore: filemap_incore,
    + nopage: filemap_nopage,
    + swapout: filemap_swapout,
     };
     
     /*
    @@ -1645,15 +1665,8 @@
      * know they can't ever get write permissions..)
      */
     static struct vm_operations_struct file_private_mmap = {
    - NULL, /* open */
    - NULL, /* close */
    - NULL, /* unmap */
    - NULL, /* protect */
    - NULL, /* sync */
    - NULL, /* advise */
    - filemap_nopage, /* nopage */
    - NULL, /* wppage */
    - NULL /* swapout */
    + incore: filemap_incore,
    + nopage: filemap_nopage,
     };
     
     /* This is used for a general mmap of a disk file */
    diff -ruN Linux-2.3.40-3/mm/memory.c linux/mm/memory.c
    --- Linux-2.3.40-3/mm/memory.c Tue Jan 11 12:40:57 2000
    +++ linux/mm/memory.c Wed Jan 19 17:24:45 2000
    @@ -412,6 +412,36 @@
             return NULL;
     }
     
    +/*
    + * default_incore - default "incore" test used by the mincore() system call
    + * @vma: vm area being examined; only vma->vm_mm is used
    + * @address: virtual address of the page in question
    + *
    + * Walks the page tables of vma->vm_mm and returns 1 if a present pte
    + * maps "address", otherwise 0.
    + *
    + * pgd_offset(), pmd_offset() and pte_offset() only compute a pointer
    + * into a table and do not return NULL for a missing level, so each entry
    + * is tested with pgd_none()/pgd_bad() and pmd_none()/pmd_bad() first.
    + */
    +char default_incore(struct vm_area_struct * vma, unsigned long address)
    +{
    + pgd_t *pgd;
    + pmd_t *pmd;
    + pte_t *pte;
    +
    + pgd = pgd_offset(vma->vm_mm, address);
    + if (pgd_none(*pgd) || pgd_bad(*pgd))
    + return 0;
    +
    + pmd = pmd_offset(pgd, address);
    + if (pmd_none(*pmd) || pmd_bad(*pmd))
    + return 0;
    +
    + pte = pte_offset(pmd, address);
    + return pte_present(*pte) ? 1 : 0;
    +}
    +
     /*
      * Given a physical address, is there a useful struct page pointing to it?
      */
    diff -ruN Linux-2.3.40-3/mm/mmap.c linux/mm/mmap.c
    --- Linux-2.3.40-3/mm/mmap.c Tue Dec 21 13:59:00 1999
    +++ linux/mm/mmap.c Wed Jan 19 17:55:57 2000
    @@ -720,6 +720,173 @@
     }
     
     /*
    + * mincore_area - report the residency of the pages of one vma
    + * @vma: vm area that contains "start"
    + * @start: virtual address of the first page to examine
    + * @end: virtual address just past the last page; clamped to vma->vm_end
    + * @vec: user buffer that receives one byte per page
    + *
    + * Each byte comes from the vma's "incore" predicate, which is handed a
    + * page offset. A vma without that predicate falls back to
    + * default_incore(), which looks at the page tables and so must be handed
    + * the virtual address of the page instead.
    + *
    + * Results are staged in one kernel page and copied out a piece at a time,
    + * so the amount of kernel memory used does not depend on the vma size.
    + *
    + * Returns 0 on success, -ENOMEM if the staging page cannot be allocated,
    + * or -EFAULT if "vec" cannot be written.
    + */
    +static long mincore_area(struct vm_area_struct * vma,
    + unsigned long start, unsigned long end, char * vec)
    +{
    + int remaining, error, size, i;
    + char * tmp;
    +
    + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
    + if (end > vma->vm_end)
    + end = vma->vm_end;
    + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
    +
    + /* # of bytes in "vec" = # of pages */
    + size = end - start;
    +
    + tmp = (char *) __get_free_page(GFP_KERNEL);
    + if (!tmp)
    + return -ENOMEM;
    +
    + error = 0;
    + for (remaining = size, i = 0;
    + remaining > 0;
    + remaining -= PAGE_SIZE, i++) {
    + int j = 0;
    + int thispiece = (remaining < PAGE_SIZE) ? remaining : PAGE_SIZE;
    +
    + if (vma->vm_ops && vma->vm_ops->incore) {
    + while (j < thispiece)
    + tmp[j++] = vma->vm_ops->incore(vma, start++);
    + } else {
    + /*
    + * "start" is a page offset by now; convert it back to the
    + * virtual address that default_incore() walks the page
    + * tables with.
    + */
    + unsigned long address = vma->vm_start +
    + ((start - vma->vm_pgoff) << PAGE_SHIFT);
    +
    + for (; j < thispiece; j++, start++, address += PAGE_SIZE)
    + tmp[j] = default_incore(vma, address);
    + }
    +
    + if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
    + error = -EFAULT;
    + break;
    + }
    + }
    +
    + free_page((unsigned long) tmp);
    + return error;
    +}
    +
    +/*
    + * The mincore(2) system call.
    + *
    + * mincore() returns the memory residency status of the pages in the
    + * current process's address space specified by [addr, addr + len).
    + * The status is returned in a vector of bytes, one byte per page. The
    + * least significant bit of each byte is 1 if the referenced page is in
    + * memory, otherwise it is zero. Byte N always describes page N of the
    + * range; bytes that correspond to unmapped pages are not written.
    + *
    + * Because the status of a page can change after mincore() checks it
    + * but before it returns to the application, the returned vector may
    + * contain stale information. Only locked pages are guaranteed to
    + * remain in memory.
    + *
    + * return values:
    + * zero - success; this includes len == 0
    + * -EFAULT - vec points to an illegal address
    + * -EINVAL - addr is not a multiple of PAGE_SIZE, or addr + len
    + * (with len rounded up to whole pages) wraps around
    + * -ENOMEM - Addresses in the range [addr, addr + len) are
    + * invalid for the address space of this process, or
    + * specify one or more pages which are not currently
    + * mapped
    + *
    + * NOTE(review): mincore_area() calls copy_to_user() with mmap_sem held.
    + * If the page fault path takes mmap_sem as well, a fault on "vec" would
    + * deadlock -- confirm before relying on this.
    + */
    +asmlinkage long sys_mincore(unsigned long start, size_t len, char *vec)
    +{
    + unsigned long index = 0;
    + unsigned long end;
    + struct vm_area_struct * vma;
    + int unmapped_error = 0;
    + int error = -EINVAL;
    +
    + down(&current->mm->mmap_sem);
    +
    + if (start & ~PAGE_MASK)
    + goto out;
    + /* rounding len up to a whole page must not wrap to a small value */
    + if (len + ~PAGE_MASK < len)
    + goto out;
    + len = (len + ~PAGE_MASK) & PAGE_MASK;
    + end = start + len;
    + if (end < start)
    + goto out;
    +
    + error = 0;
    + if (end == start)
    + goto out;
    +
    + /*
    + * If the interval [start,end) covers some unmapped address
    + * ranges, just ignore them, but return -ENOMEM at the end.
    + */
    + vma = find_vma(current->mm, start);
    + for (;;) {
    + /* Still start < end. */
    + error = -ENOMEM;
    + if (!vma)
    + goto out;
    +
    + /* Here start < vma->vm_end. */
    + if (start < vma->vm_start) {
    + unmapped_error = -ENOMEM;
    + /* step over the bytes of "vec" that belong to the hole */
    + index += (vma->vm_start - start) >> PAGE_SHIFT;
    + start = vma->vm_start;
    + }
    +
    + /* Here vma->vm_start <= start < vma->vm_end. */
    + if (end <= vma->vm_end) {
    + if (start < end) {
    + error = mincore_area(vma, start, end,
    + &vec[index]);
    + if (error)
    + goto out;
    + }
    + error = unmapped_error;
    + goto out;
    + }
    +
    + /* Here vma->vm_start <= start < vma->vm_end < end. */
    + error = mincore_area(vma, start, vma->vm_end, &vec[index]);
    + if (error)
    + goto out;
    + index += (vma->vm_end - start) >> PAGE_SHIFT;
    + start = vma->vm_end;
    + vma = vma->vm_next;
    + }
    +
    +out:
    + up(&current->mm->mmap_sem);
    + return error;
    +}
    +
    +/*
      * this is really a simplified "do_mmap". it only handles
      * anonymous maps. eventually we may be able to do some
      * brk-specific accounting here.
    diff -ruN Linux-2.3.40-3/net/packet/af_packet.c linux/net/packet/af_packet.c
    --- Linux-2.3.40-3/net/packet/af_packet.c Fri Dec 31 15:04:42 1999
    +++ linux/net/packet/af_packet.c Wed Jan 19 17:42:55 2000
    @@ -1537,15 +1537,8 @@
     }
     
     static struct vm_operations_struct packet_mmap_ops = {
    - packet_mm_open, /* open */
    - packet_mm_close, /* close */
    - NULL, /* unmap */
    - NULL, /* no special protect */
    - NULL, /* sync */
    - NULL, /* advise */
    - NULL, /* nopage */
    - NULL, /* wppage */
    - NULL /* swapout */
    + open: packet_mm_open,
    + close: packet_mm_close,
     };
     
     static void free_pg_vec(unsigned long *pg_vec, unsigned order, unsigned len)

            - Chuck Lever

    --
    corporate:	<chuckl@netscape.com>
    personal:	<chucklever@netscape.net> or <cel@monkey.org>
    

    The Linux Scalability project: http://www.citi.umich.edu/projects/linux-scalability/

    - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.rutgers.edu Please read the FAQ at http://www.tux.org/lkml/



    This archive was generated by hypermail 2b29 : Fri Jan 21 2000 - 16:04:23 EST