1. mmap_pgoff的系统调用实现如下
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, pgoff)
{
return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}
2. ksys_mmap_pgoff函数
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
struct file *file = NULL;
unsigned long retval;
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
if (!file)
return -EBADF;
if (is_file_hugepages(file)) {
len = ALIGN(len, huge_page_size(hstate_file(file)));
} else if (unlikely(flags & MAP_HUGETLB)) {
retval = -EINVAL;
goto out_fput;
}
} else if (flags & MAP_HUGETLB) {
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (!hs)
return -EINVAL;
len = ALIGN(len, huge_page_size(hs));
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
if (file)
fput(file);
return retval;
}
vm_mmap_pgoff函数
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
unsigned long ret;
struct mm_struct *mm = current->mm;
unsigned long populate;
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
&uf);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(ret, populate);
}
return ret;
}
do_mmap函数
/*
* The caller must write-lock current->mm->mmap_lock.
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, vm_flags_t vm_flags,
unsigned long pgoff, unsigned long *populate,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
int pkey = 0;
*populate = 0;
if (!len)
return -EINVAL;
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
* (the exception is when the underlying filesystem is noexec
* mounted, in which case we dont add PROT_EXEC.)
*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
if (!(file && path_noexec(&file->f_path)))
prot |= PROT_EXEC;
/* force arch specific MAP_FIXED handling in get_unmapped_area */
if (flags & MAP_FIXED_NOREPLACE)
flags |= MAP_FIXED;
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
/* Careful about overflows.. */
len = PAGE_ALIGN(len);
if (!len)
return -ENOMEM;
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
return -EOVERFLOW;
/* Too many mappings? */
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
return addr;
if (flags & MAP_FIXED_NOREPLACE) {
if (find_vma_intersection(mm, addr, addr + len))
return -EEXIST;
}
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(mm);
if (pkey < 0)
pkey = 0;
}
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
if (!mlock_future_ok(mm, vm_flags, len))
return -EAGAIN;
if (file) {
struct inode *inode = file_inode(file);
unsigned long flags_mask;
if (!file_mmap_ok(file, inode, pgoff, len))
return -EOVERFLOW;
flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
switch (flags & MAP_TYPE) {
case MAP_SHARED:
/*
* Force use of MAP_SHARED_VALIDATE with non-legacy
* flags. E.g. MAP_SYNC is dangerous to use with
* MAP_SHARED as you don't know which consistency model
* you will get. We silently ignore unsupported flags
* with MAP_SHARED to preserve backward compatibility.
*/
flags &= LEGACY_MAP_MASK;
fallthrough;
case MAP_SHARED_VALIDATE:
if (flags & ~flags_mask)
return -EOPNOTSUPP;
if (prot & PROT_WRITE) {
if (!(file->f_mode & FMODE_WRITE))
return -EACCES;
if (IS_SWAPFILE(file->f_mapping->host))
return -ETXTBSY;
}
/*
* Make sure we don't allow writing to an append-only
* file..
*/
if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
return -EACCES;
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
fallthrough;
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
if (path_noexec(&file->f_path)) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
if (!file->f_op->mmap)
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
break;
default:
return -EINVAL;
}
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
/*
* Ignore pgoff.
*/
pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
/*
* Set pgoff according to addr for anon_vma.
*/
pgoff = addr >> PAGE_SHIFT;
break;
default:
return -EINVAL;
}
}
/*
* Set 'VM_NORESERVE' if we should not account for the
* memory use of this mapping.
*/
if (flags & MAP_NORESERVE) {
/* We honor MAP_NORESERVE if allowed to overcommit */
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
vm_flags |= VM_NORESERVE;
/* hugetlb applies strict overcommit unless MAP_NORESERVE */
if (file && is_file_hugepages(file))
vm_flags |= VM_NORESERVE;
}
addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
*populate = len;
return addr;
}
get_unmapped_area函数
do_mmap的函数中的会使用get_unmapped_area获取addr
get_unmapped_area
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
unsigned long (*get_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long);
unsigned long error = arch_mmap_check(addr, len, flags);
if (error)
return error;
/* Careful about overflows.. */
if (len > TASK_SIZE)
return -ENOMEM;
get_area = current->mm->get_unmapped_area;
if (file) {
if (file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
} else if (flags & MAP_SHARED) {
/*
* mmap_region() will call shmem_zero_setup() to create a file,
* so use shmem's get_unmapped_area in case it can be huge.
* do_mmap() will clear pgoff, so match alignment.
*/
pgoff = 0;
get_area = shmem_get_unmapped_area;
}
addr = get_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
return addr;
if (addr > TASK_SIZE - len)
return -ENOMEM;
if (offset_in_page(addr))
return -EINVAL;
error = security_mmap_addr(addr);
return error ? error : addr;
}
mmap_region函数
get_unmapped_area获取到addr,之后addr传给mmap_region函数
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct vm_area_struct *next, *prev, *merge;
pgoff_t pglen = len >> PAGE_SHIFT;
unsigned long charged = 0;
unsigned long end = addr + len;
unsigned long merge_start = addr, merge_end = end;
pgoff_t vm_pgoff;
int error;
VMA_ITERATOR(vmi, mm, addr);
/* Check against address space limit. */
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
unsigned long nr_pages;
/*
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/
nr_pages = count_vma_pages_range(mm, addr, end);
if (!may_expand_vm(mm, vm_flags,
(len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}
/* Unmap any existing mapping in the area */
if (do_vmi_munmap(&vmi, mm, addr, len, uf, false))
return -ENOMEM;
/*
* Private writable mapping: check memory availability
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
next = vma_next(&vmi);
prev = vma_prev(&vmi);
if (vm_flags & VM_SPECIAL) {
if (prev)
vma_iter_next_range(&vmi);
goto cannot_expand;
}
/* Attempt to expand an old mapping */
/* Check next */
if (next && next->vm_start == end && !vma_policy(next) &&
can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
NULL_VM_UFFD_CTX, NULL)) {
merge_end = next->vm_end;
vma = next;
vm_pgoff = next->vm_pgoff - pglen;
}
/* Check prev */
if (prev && prev->vm_end == addr && !vma_policy(prev) &&
(vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
pgoff, vma->vm_userfaultfd_ctx, NULL) :
can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
NULL_VM_UFFD_CTX, NULL))) {
merge_start = prev->vm_start;
vma = prev;
vm_pgoff = prev->vm_pgoff;
} else if (prev) {
vma_iter_next_range(&vmi);
}
/* Actually expand, if possible */
if (vma &&
!vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
khugepaged_enter_vma(vma, vm_flags);
goto expanded;
}
if (vma == prev)
vma_iter_set(&vmi, addr);
cannot_expand:
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
vma_iter_config(&vmi, addr, end);
vma->vm_start = addr;
vma->vm_end = end;
vm_flags_init(vma, vm_flags);
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
if (file) {
if (vm_flags & VM_SHARED) {
error = mapping_map_writable(file->f_mapping);
if (error)
goto free_vma;
}
vma->vm_file = get_file(file);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
/*
* Expansion is handled above, merging is handled below.
* Drivers should not alter the address of the VMA.
*/
error = -EINVAL;
if (WARN_ON((addr != vma->vm_start)))
goto close_and_free_vma;
vma_iter_config(&vmi, addr, end);
/*
* If vm_flags changed after call_mmap(), we should try merge
* vma again as we may succeed this time.
*/
if (unlikely(vm_flags != vma->vm_flags && prev)) {
merge = vma_merge(&vmi, mm, prev, vma->vm_start,
vma->vm_end, vma->vm_flags, NULL,
vma->vm_file, vma->vm_pgoff, NULL,
NULL_VM_UFFD_CTX, NULL);
if (merge) {
/*
* ->mmap() can change vma->vm_file and fput
* the original file. So fput the vma->vm_file
* here or we would add an extra fput for file
* and cause general protection fault
* ultimately.
*/
fput(vma->vm_file);
vm_area_free(vma);
vma = merge;
/* Update vm_flags to pick up the change. */
vm_flags = vma->vm_flags;
goto unmap_writable;
}
}
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
} else {
vma_set_anonymous(vma);
}
if (map_deny_write_exec(vma, vma->vm_flags)) {
error = -EACCES;
goto close_and_free_vma;
}
/* Allow architectures to sanity-check the vm_flags */
error = -EINVAL;
if (!arch_validate_flags(vma->vm_flags))
goto close_and_free_vma;
error = -ENOMEM;
if (vma_iter_prealloc(&vmi, vma))
goto close_and_free_vma;
/* Lock the VMA since it is modified after insertion into VMA tree */
vma_start_write(vma);
vma_iter_store(&vmi, vma);
mm->map_count++;
if (vma->vm_file) {
i_mmap_lock_write(vma->vm_file->f_mapping);
if (vma->vm_flags & VM_SHARED)
mapping_allow_writable(vma->vm_file->f_mapping);
flush_dcache_mmap_lock(vma->vm_file->f_mapping);
vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
i_mmap_unlock_write(vma->vm_file->f_mapping);
}
/*
* vma_merge() calls khugepaged_enter_vma() either, the below
* call covers the non-merge case.
*/
khugepaged_enter_vma(vma, vma->vm_flags);
/* Once vma denies write, undo our temporary denial count */
unmap_writable:
if (file && vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
file = vma->vm_file;
ksm_add_vma(vma);
expanded:
perf_event_mmap(vma);
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vm_flags_clear(vma, VM_LOCKED_MASK);
else
mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
uprobe_mmap(vma);
/*
* New (or expanded) vma always get soft dirty status.
* Otherwise user-space soft-dirty page tracker won't
* be able to distinguish situation when vma area unmapped,
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
vm_flags_set(vma, VM_SOFTDIRTY);
vma_set_page_prot(vma);
validate_mm(mm);
return addr;
close_and_free_vma:
if (file && vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (file || vma->vm_file) {
unmap_and_free_vma:
fput(vma->vm_file);
vma->vm_file = NULL;
vma_iter_set(&vmi, vma->vm_end);
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
vma->vm_end, vma->vm_end, true);
}
if (file && (vm_flags & VM_SHARED))
mapping_unmap_writable(file->f_mapping);
free_vma:
vm_area_free(vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
validate_mm(mm);
return error;
}