一、 启动xv6
- 当基于risc-v指令集的计算机通电之后,会读取存储在ROM(read-only memory)的引导程序(boot loader),该引导程序将xv6的内核代码加载到内存(
0x80000000
,为什么是这里,因为0x0~0x80000000这段地址空间是留给I/O设备的)。 - cpu在机器模式(machine mode)下开始从
_entry
处执行xv6的内核。
下面是kernel/entry.S
的代码# qemu -kernel loads the kernel at 0x80000000 # and causes each CPU to jump there. # kernel.ld causes the following code to # be placed at 0x80000000. .section .text .global _entry _entry: # set up a stack for C. # stack0 is declared in start.c, # with a 4096-byte stack per CPU. # sp = stack0 + (hartid * 4096) la sp, stack0 li a0, 1024*4 csrr a1, mhartid addi a1, a1, 1 mul a0, a0, a1 add sp, sp, a0 # jump to start() in start.c call start spin: j spin
.section
:是risc-v汇编语言中的一个伪指令,用于告诉汇编器接下来的代码或数据应该放在哪个区域,.section .text:表示接下来的代码应该放在代码区域。
.global
:是伪指令(directive),用于告诉汇编器和链接器某个内容是对外可见(可以被其他源文件引用)。
la
:是伪指令,加载地址(load address),sp
是栈指针寄存器(stack pointer),stack0是在start.c中声明的符号,表示内核启动时使用的栈空间,每个CPU占4096字节。
li
:是伪指令,加载立即数(load immediate),a0
是risc-v架构中的通用寄存器,用于保存函数参数或中间结果。
csrr
:是riscv的指令,从某个控制状态寄存器中读取内容(control and status register read),这里的mhartid
是csr中的一个,标识cpu编号。
addi
:表示把一个立即数加到寄存器中(add immediate)。
mul
:表示把两个寄存器中的内容相乘。
call
:riscv的伪指令,用于调用函数,这里是跳转到start位置去执行代码。
j
:无条件跳转。注意和call的区别,call会自动保存返回地址到ra寄存器,被调用的函数执行完可以返回,j则不保存返回地址,不会返回。严格来说call和j都是伪指令:j offset会被汇编器展开为jal x0, offset,call则展开为auipc加jalr(伪指令和指令的区别是伪指令并不和机器码一一对应,而是由汇编器展开成一条或多条真实指令) - 执行start函数start():
start()
↓
设置 mstatus -> Supervisor Mode
↓
设置 mepc -> main()
↓
关闭分页
↓
中断/异常委托给 Supervisor
↓
PMP 允许访问所有内存
↓
初始化定时器
↓
保存 hartid 到 tp
↓
mret -> 跳转到 main() - 执行main()函数
二、 第一个进程process
xv6的main.c
函数
#include "types.h"
#include "param.h"
#include "memlayout.h"
#include "riscv.h"
#include "defs.h"
volatile static int started = 0;
// start() jumps here in supervisor mode on all CPUs.
// The CPU whose cpuid() is 0 runs the one-time initialization below and
// then sets `started`; every other CPU spins until `started` is non-zero
// and then performs only the per-hart setup. All CPUs end in scheduler().
void
main()
{
if(cpuid() == 0){
consoleinit();
printfinit();
printf("\n");
printf("xv6 kernel is booting\n");
printf("\n");
kinit(); // physical page allocator
kvminit(); // create kernel page table
kvminithart(); // turn on paging
procinit(); // process table
trapinit(); // trap vectors
trapinithart(); // install kernel trap vector
plicinit(); // set up interrupt controller
plicinithart(); // ask PLIC for device interrupts
binit(); // buffer cache
iinit(); // inode table
fileinit(); // file table
virtio_disk_init(); // emulated hard disk
userinit(); // first user process
// Memory barrier first, then publish the flag the other CPUs poll.
__sync_synchronize();
started = 1;
} else {
// Busy-wait until CPU 0 has finished the initialization above.
while(started == 0)
;
__sync_synchronize();
printf("hart %d starting\n", cpuid());
// Only the per-hart steps are repeated here.
kvminithart(); // turn on paging
trapinithart(); // install kernel trap vector
plicinithart(); // ask PLIC for device interrupts
}
scheduler();
}
xv6是主cpu进行初始化,其他cpu等待主cpu初始化完成后,再进行初始化
其中userinit()
函数中完成了创建第一个用户态进程。
// Build the very first user process: one page of memory holding
// initcode, a trapframe aimed at address 0, and state RUNNABLE.
void
userinit(void)
{
  struct proc *first = allocproc();

  initproc = first;

  // One user page, filled with initcode's instructions and data.
  uvminit(first->pagetable, initcode, sizeof(initcode));
  first->sz = PGSIZE;

  // Register state for the first "return" from kernel to user space.
  first->trapframe->sp = PGSIZE;  // user stack pointer
  first->trapframe->epc = 0;      // user program counter

  safestrcpy(first->name, "initcode", sizeof(first->name));
  first->cwd = namei("/");

  first->state = RUNNABLE;

  // allocproc() returned with the lock held; drop it now.
  release(&first->lock);
}
其中结构体proc如下,保存进程的相关信息
// Saved registers for kernel context switches.
// Both struct cpu and struct proc embed one of these as the place
// swtch() switches to.
struct context {
uint64 ra; // allocproc() points this at forkret for a new process
uint64 sp; // allocproc() sets this to p->kstack + PGSIZE
// callee-saved
uint64 s0;
uint64 s1;
uint64 s2;
uint64 s3;
uint64 s4;
uint64 s5;
uint64 s6;
uint64 s7;
uint64 s8;
uint64 s9;
uint64 s10;
uint64 s11;
};
// Per-CPU state.
// Element type of the cpus[NCPU] array declared right after this struct.
struct cpu {
struct proc *proc; // The process running on this cpu, or null.
struct context context; // swtch() here to enter scheduler().
int noff; // Depth of push_off() nesting.
int intena; // Were interrupts enabled before push_off()?
};
extern struct cpu cpus[NCPU];
// per-process data for the trap handling code in trampoline.S.
// sits in a page by itself just under the trampoline page in the
// user page table. not specially mapped in the kernel page table.
// the sscratch register points here.
// uservec in trampoline.S saves user registers in the trapframe,
// then initializes registers from the trapframe's
// kernel_sp, kernel_hartid, kernel_satp, and jumps to kernel_trap.
// usertrapret() and userret in trampoline.S set up
// the trapframe's kernel_*, restore user registers from the
// trapframe, switch to the user page table, and enter user space.
// the trapframe includes callee-saved user registers like s0-s11 because the
// return-to-user path via usertrapret() doesn't return through
// the entire kernel call stack.
// The number in the leading /* N */ comment of each field is its byte
// offset inside the struct; every field is a uint64, so offsets step by 8.
struct trapframe {
/* 0 */ uint64 kernel_satp; // kernel page table
/* 8 */ uint64 kernel_sp; // top of process's kernel stack
/* 16 */ uint64 kernel_trap; // usertrap()
/* 24 */ uint64 epc; // saved user program counter
/* 32 */ uint64 kernel_hartid; // saved kernel tp
// Saved user registers follow.
/* 40 */ uint64 ra;
/* 48 */ uint64 sp;
/* 56 */ uint64 gp;
/* 64 */ uint64 tp;
/* 72 */ uint64 t0;
/* 80 */ uint64 t1;
/* 88 */ uint64 t2;
/* 96 */ uint64 s0;
/* 104 */ uint64 s1;
// a0-a5 are what argraw() returns as system call arguments 0-5.
/* 112 */ uint64 a0;
/* 120 */ uint64 a1;
/* 128 */ uint64 a2;
/* 136 */ uint64 a3;
/* 144 */ uint64 a4;
/* 152 */ uint64 a5;
/* 160 */ uint64 a6;
/* 168 */ uint64 a7;
/* 176 */ uint64 s2;
/* 184 */ uint64 s3;
/* 192 */ uint64 s4;
/* 200 */ uint64 s5;
/* 208 */ uint64 s6;
/* 216 */ uint64 s7;
/* 224 */ uint64 s8;
/* 232 */ uint64 s9;
/* 240 */ uint64 s10;
/* 248 */ uint64 s11;
/* 256 */ uint64 t3;
/* 264 */ uint64 t4;
/* 272 */ uint64 t5;
/* 280 */ uint64 t6;
};
// Values stored in the state field of struct proc.
enum procstate {
  UNUSED,
  USED,
  SLEEPING,
  RUNNABLE,
  RUNNING,
  ZOMBIE
};
// Per-process state
// One slot of the proc[] table that allocproc() scans for an UNUSED entry.
struct proc {
struct spinlock lock;
// p->lock must be held when using these:
enum procstate state; // Process state
void *chan; // If non-zero, sleeping on chan
int killed; // If non-zero, have been killed
int xstate; // Exit status to be returned to parent's wait
int pid; // Process ID
// wait_lock must be held when using this:
struct proc *parent; // Parent process
// these are private to the process, so p->lock need not be held.
uint64 kstack; // Virtual address of kernel stack
uint64 sz; // Size of process memory (bytes)
pagetable_t pagetable; // User page table
struct trapframe *trapframe; // data page for trampoline.S
struct context context; // swtch() here to run process
struct file *ofile[NOFILE]; // Open files
struct inode *cwd; // Current directory
char name[16]; // Process name (debugging)
};
allocproc()函数如下,其作用是遍历进程数组,寻找一个unused
的进程槽,同时初始化进程号、进程状态、trapframe page、页表、上下文、栈,最后返回进程结构体指针:
// Scan the process table for a slot whose state is UNUSED.
// On success the slot receives a pid, a trapframe page, an empty user
// page table and a fresh context, and is returned with its lock held.
// Returns 0 when no slot is free or when an allocation fails.
static struct proc*
allocproc(void)
{
  struct proc *slot;

  for(slot = proc; slot < &proc[NPROC]; slot++){
    acquire(&slot->lock);
    if(slot->state == UNUSED)
      break;                  // leave the loop with slot->lock held
    release(&slot->lock);
  }

  // Ran off the end of the table: nothing was free.
  if(slot == &proc[NPROC])
    return 0;

  slot->pid = allocpid();
  slot->state = USED;

  // Trapframe page.
  slot->trapframe = (struct trapframe *)kalloc();
  if(slot->trapframe == 0){
    freeproc(slot);
    release(&slot->lock);
    return 0;
  }

  // Empty user page table.
  slot->pagetable = proc_pagetable(slot);
  if(slot->pagetable == 0){
    freeproc(slot);
    release(&slot->lock);
    return 0;
  }

  // Fresh context: execution starts at forkret, which returns to
  // user space, running on this process's kernel stack.
  memset(&slot->context, 0, sizeof(slot->context));
  slot->context.sp = slot->kstack + PGSIZE;
  slot->context.ra = (uint64)forkret;

  return slot;
}
uvminit()
函数:将initcode代码加载到页表的虚拟地址0处,kalloc()
函数是在空闲的物理页中分配一个,返回一个void *的指针,指向申请到的物理页,mappages()
进行内存地址映射,将物理内存地址映射到虚拟内存地址,memmove()
函数将代码拷贝进刚申请到的物理页。
// Load the user initcode into address 0 of pagetable,
// for the very first process.
// sz must be less than a page.
//
// pagetable: page table that receives the mapping at virtual address 0.
// src, sz:   bytes to copy into the new page and their count.
// Panics if sz is a page or more, or if no physical page can be allocated.
void
uvminit(pagetable_t pagetable, uchar *src, uint sz)
{
  char *mem;

  if(sz >= PGSIZE)
    panic("inituvm: more than a page");

  // kalloc() yields 0 when no page is available; stop with a clear
  // message rather than letting memset() write through a null pointer.
  mem = kalloc();
  if(mem == 0)
    panic("inituvm: out of memory");

  memset(mem, 0, PGSIZE);
  mappages(pagetable, 0, PGSIZE, (uint64)mem, PTE_W|PTE_R|PTE_X|PTE_U);
  memmove(mem, src, sz);
}
然后userinit()
函数设置进程的pc和栈指针、进程名、进程工作的目录、进程状态设置为RUNNABLE,最后释放allocproc()
函数申请的锁。
看一下initcode.S
的内容
initcode将init
、argv
的地址加载进寄存器a0和a1,同时将系统调用exec的调用号加载进寄存器a7,然后发起ecall,上面的数据加载格式是riscv中ecall默认要求的a0-a5用于传递系统调用需要用到的参数,a7用于传递系统调用编号。
# Initial process that execs /init.
# This code runs in user space.
#include "syscall.h"
# exec(init, argv)
# Arguments go in a0 and a1, the system call number in a7.
.globl start
start:
la a0, init          # a0 = address of the "/init" string below
la a1, argv          # a1 = address of the argv array below
li a7, SYS_exec      # a7 = system call number for exec
ecall
# for(;;) exit();
# Reached only if the exec above comes back.
exit:
li a7, SYS_exit      # a7 = system call number for exit
ecall
jal exit             # loop back and try again
# char init[] = "/init\0";
init:
.string "/init\0"
# char *argv[] = { init, 0 };
.p2align 2
argv:
.long init
.long 0
init.c
中的内容如下:这是xv6启动的第一个用户进程,负责初始化控制台、设置标准I/O,同时启动一个shell。
// init: The initial user-level program
#include "kernel/types.h"
#include "kernel/stat.h"
#include "kernel/spinlock.h"
#include "kernel/sleeplock.h"
#include "kernel/fs.h"
#include "kernel/file.h"
#include "user/user.h"
#include "kernel/fcntl.h"
char *argv[] = { "sh", 0 };
// Open the console as fds 0, 1 and 2, then keep a shell running
// forever, reaping any other children that exit in the meantime.
int
main(void)
{
  int shell, reaped;

  // First open attempt fails if the device node is missing: create it.
  if(open("console", O_RDWR) < 0){
    mknod("console", CONSOLE, 0);
    open("console", O_RDWR);
  }
  dup(0);  // stdout
  dup(0);  // stderr

  while(1){
    printf("init: starting sh\n");
    shell = fork();
    if(shell < 0){
      printf("init: fork failed\n");
      exit(1);
    }
    if(shell == 0){
      exec("sh", argv);
      printf("init: exec sh failed\n");
      exit(1);
    }

    // wait() returns when the shell exits or when a parentless
    // process exits; only the former ends this inner loop so that
    // the shell gets restarted.
    while((reaped = wait((int *) 0)) != shell){
      if(reaped < 0){
        printf("init: wait returned an error\n");
        exit(1);
      }
      // otherwise it was a parentless process; nothing to do.
    }
  }
}
三、 创建一个系统调用
首先要明白一件事情,系统调用是在内核态下执行的,但是确实会被用户态下的进程使用,在xv6中会在用户态下对系统调用先进行一次封装,个人理解是这一层封装是为了提供便于接收参数的接口。
- 于是需要先在
user/user.h
中声明一个调用。 - 其次需要提前设置系统调用所需要的寄存器中的内容,然后通过ecall指令进入内核态进行系统调用,在xv6中,上面调用ecall的代码由
usys.pl
这个脚本自动生成,同时a0-a5的参数是c语言编译器自动将封装的接口的参数一一对应的放进去(也就是第一个参数对应a0寄存器,第二个参数对应a1寄存器,…)。 - 完成了用户态下系统调用的注册之后就需要进行内核中代码的实现,同样需要在内核中注册一下,先给系统调用一个编号,这一步在
syscall.h
中添加。 - 在分配完系统调用的编号之后,需要在
syscall.c
中进行注册。 - 完成注册之后就是实现系统调用的具体的逻辑了,这一步在
kernel
文件夹下创建一个文件即可,xv6中进行了一些分类,比如与进程相关的系统调用的实现在sysproc.c
中,与文件相关的在sysfile.c
中。
完成以上步骤一个完整的系统调用也就创建完成了,重新梳理一下过程即为:
关于具体如何获取用户态封装函数传入的参数,上面说了是通过具体的寄存器,不过mit实现的xv6中在kernel/syscall.c
中帮我们封装了几个函数供我们调用。
其中argraw
是最底层的封装,它直接从当前进程的trapframe中读取保存下来的寄存器a0~a5的值。
static uint64
argraw(int n)
{
struct proc *p = myproc();
switch (n) {
case 0:
return p->trapframe->a0;
case 1:
return p->trapframe->a1;
case 2:
return p->trapframe->a2;
case 3:
return p->trapframe->a3;
case 4:
return p->trapframe->a4;
case 5:
return p->trapframe->a5;
}
panic("argraw");
return -1;
}
下面是其他实现。
// Read the uint64 stored at user address addr of the current process
// into *ip. Returns 0 on success, -1 if the word does not lie entirely
// below the process size or if copyin() fails.
int
fetchaddr(uint64 addr, uint64 *ip)
{
  struct proc *cur = myproc();
  uint64 limit = cur->sz;

  if(addr >= limit)
    return -1;
  if(addr + sizeof(uint64) > limit)
    return -1;

  return copyin(cur->pagetable, (char *)ip, addr, sizeof(*ip)) != 0 ? -1 : 0;
}
// Fetch the nth 32-bit system call argument.
int
argint(int n, int *ip)
{
*ip = argraw(n);
return 0;
}
// Store system call argument n in *ip as a raw address.
// No validity check is made here; copyin/copyout do that
// when the address is actually used. Always returns 0.
int
argaddr(int n, uint64 *ip)
{
  uint64 raw = argraw(n);

  *ip = raw;
  return 0;
}
// Treat system call argument n as the address of a null-terminated
// string and copy it into buf, at most max bytes.
// Returns string length if OK (including nul), -1 if error.
int
argstr(int n, char *buf, int max)
{
  uint64 uaddr;
  int rc = argaddr(n, &uaddr);

  if(rc < 0)
    return -1;

  return fetchstr(uaddr, buf, max);
}
四、 trace实现
有了上面的基础,接下来实现起来就更容易一些了。
首先在用户态下进行注册。
在
user/user.h
中声明系统调用// system calls int fork(void); int exit(int) __attribute__((noreturn)); int wait(int*); int pipe(int*); int write(int, const void*, int); int read(int, void*, int); int close(int); int kill(int); int exec(char*, char**); int open(const char*, int); int mknod(const char*, short, short); int unlink(const char*); int fstat(int fd, struct stat*); int link(const char*, const char*); int mkdir(const char*); int chdir(const char*); int dup(int); int getpid(void); char* sbrk(int); int sleep(int); int uptime(void); int trace(int); # our code int sysinfo(struct sysinfo *);
在usys.pl脚本中注册以便生成的usys.S包含trace的具体实现
entry("fork"); entry("exit"); entry("wait"); entry("pipe"); entry("read"); entry("write"); entry("close"); entry("kill"); entry("exec"); entry("open"); entry("mknod"); entry("unlink"); entry("fstat"); entry("link"); entry("mkdir"); entry("chdir"); entry("dup"); entry("getpid"); entry("sbrk"); entry("sleep"); entry("uptime"); entry("trace"); entry("sysinfo");
内核态下申请系统调用编号
kernel/syscall.h
// System call numbers #define SYS_fork 1 #define SYS_exit 2 #define SYS_wait 3 #define SYS_pipe 4 #define SYS_read 5 #define SYS_kill 6 #define SYS_exec 7 #define SYS_fstat 8 #define SYS_chdir 9 #define SYS_dup 10 #define SYS_getpid 11 #define SYS_sbrk 12 #define SYS_sleep 13 #define SYS_uptime 14 #define SYS_open 15 #define SYS_write 16 #define SYS_mknod 17 #define SYS_unlink 18 #define SYS_link 19 #define SYS_mkdir 20 #define SYS_close 21 #define SYS_trace 22
kernel/syscall.c
中进行注册extern uint64 sys_trace(void); static uint64 (*syscalls[])(void) = { [SYS_fork] sys_fork, [SYS_exit] sys_exit, [SYS_wait] sys_wait, [SYS_pipe] sys_pipe, [SYS_read] sys_read, [SYS_kill] sys_kill, [SYS_exec] sys_exec, [SYS_fstat] sys_fstat, [SYS_chdir] sys_chdir, [SYS_dup] sys_dup, [SYS_getpid] sys_getpid, [SYS_sbrk] sys_sbrk, [SYS_sleep] sys_sleep, [SYS_uptime] sys_uptime, [SYS_open] sys_open, [SYS_write] sys_write, [SYS_mknod] sys_mknod, [SYS_unlink] sys_unlink, [SYS_link] sys_link, [SYS_mkdir] sys_mkdir, [SYS_close] sys_close, [SYS_trace] sys_trace, };
实现具体逻辑:首先需要在每个进程的结构体中记录一个mask信息,这个是需要用于判断是否该进程调用的系统调用需要输出,因为需要子进程调用系统的也需要输出,所以需要在fork的时候把mask信息从父进程传递给子进程,同时需要读取用户态传来的mask参数,实现具体的打印逻辑
在proc.h
中关于进程的结构体中添加了一个mask
字段// Per-process state struct proc { struct spinlock lock; // p->lock must be held when using these: enum procstate state; // Process state void *chan; // If non-zero, sleeping on chan int killed; // If non-zero, have been killed int xstate; // Exit status to be returned to parent's wait int pid; // Process ID int mask; // save the need trace syscall id // wait_lock must be held when using this: struct proc *parent; // Parent process // these are private to the process, so p->lock need not be held. uint64 kstack; // Virtual address of kernel stack uint64 sz; // Size of process memory (bytes) pagetable_t pagetable; // User page table struct trapframe *trapframe; // data page for trampoline.S struct context context; // swtch() here to run process struct file *ofile[NOFILE]; // Open files struct inode *cwd; // Current directory char name[16]; // Process name (debugging) };
kernel/proc.c
的fork实现中将mask从父进程传递给子进程// Create a new process, copying the parent. // Sets up child kernel stack to return as if from fork() system call. int fork(void) { int i, pid; struct proc *np; struct proc *p = myproc(); // Allocate process. if((np = allocproc()) == 0){ return -1; } // Copy user memory from parent to child. if(uvmcopy(p->pagetable, np->pagetable, p->sz) < 0){ freeproc(np); release(&np->lock); return -1; } np->sz = p->sz; // 父进程mask传递给子进程 np->mask = p->mask; // copy saved user registers. *(np->trapframe) = *(p->trapframe); // Cause fork to return 0 in the child. np->trapframe->a0 = 0; // increment reference counts on open file descriptors. for(i = 0; i < NOFILE; i++) if(p->ofile[i]) np->ofile[i] = filedup(p->ofile[i]); np->cwd = idup(p->cwd); safestrcpy(np->name, p->name, sizeof(p->name)); pid = np->pid; release(&np->lock); acquire(&wait_lock); np->parent = p; release(&wait_lock); acquire(&np->lock); np->state = RUNNABLE; release(&np->lock); return pid; }
kernel/syscall.c
实现具体逻辑,如果当前进程调用的系统调用编号是在mask对应的位上为1就输出该进程相关信息(进程编号、进程名、系统调用返回结果xv6默认存在寄存器a0中)const static char *syscalls_name[] = {"XXX", "fork", "exit", "wait", "pipe", "read", "kill", "exec", "fstat", "chdir", "dup", "getpid", "sbrk", "sleep", "uptime", "open", "write", "mknod", "unlink", "link", "mkdir", "close", "trace"}; void syscall(void) { int num; struct proc *p = myproc(); num = p->trapframe->a7; if(num > 0 && num < NELEM(syscalls) && syscalls[num]) { p->trapframe->a0 = syscalls[num](); if((1 << num) & p->mask) { printf("%d: syscall %s -> %d\n",p->pid , syscalls_name[num], p->trapframe->a0); } } else { printf("%d %s: unknown sys call %d\n", p->pid, p->name, num); p->trapframe->a0 = -1; } }
五、 sysinfo实现
如果完成了上面的trace系统调用,sysinfo也是同样的步骤。
首先在用户态下进行注册。
在
user/user.h
中声明系统调用int sysinfo(struct sysinfo *);
在usys.pl脚本中注册以便生成的usys.S包含sysinfo的具体实现
entry("sysinfo");
内核态下申请系统调用编号
kernel/syscall.h
// System call numbers #define SYS_sysinfo 23
kernel/syscall.c
中进行注册extern uint64 sys_trace(void); extern uint64 sys_sysinfo(void); static uint64 (*syscalls[])(void) = { [SYS_fork] sys_fork, [SYS_exit] sys_exit, [SYS_wait] sys_wait, [SYS_pipe] sys_pipe, [SYS_read] sys_read, [SYS_kill] sys_kill, [SYS_exec] sys_exec, [SYS_fstat] sys_fstat, [SYS_chdir] sys_chdir, [SYS_dup] sys_dup, [SYS_getpid] sys_getpid, [SYS_sbrk] sys_sbrk, [SYS_sleep] sys_sleep, [SYS_uptime] sys_uptime, [SYS_open] sys_open, [SYS_write] sys_write, [SYS_mknod] sys_mknod, [SYS_unlink] sys_unlink, [SYS_link] sys_link, [SYS_mkdir] sys_mkdir, [SYS_close] sys_close, [SYS_trace] sys_trace, [SYS_sysinfo] sys_sysinfo, };
实现具体逻辑:这里是将空余内存的大小和not unused的进程数这两个信息带回用户态,内核需要实现统计这两个信息的函数,同时还需要将数据传递给用户态
在kernel/proc.c
中实现统计not unused进程的函数uint64 count_nproc(void) { struct proc *p; uint64 cnt = 0; for(p = proc; p < &proc[NPROC]; p ++) { if(p->state != UNUSED) cnt += 1; } return cnt; }
在
kernel/kalloc.c
中实现统计空余内存的函数uint64 freemem_size(void) { uint64 size = 0; struct run *r; acquire(&kmem.lock); r = kmem.freelist; while(r) { size += PGSIZE; r = r->next; } release(&kmem.lock); return size; }
在
kernel/sysfile.c
中实现从内核内存到用户内存数据的复制(因为两者虚拟内存map映射有差别,xv6那个教材说的),更具体而言就是copyout函数的使用,这个可以参考sysfile中的sys_fstat函数的使用。uint64 sys_sysinfo(void) { uint64 addr; // user pointer to struct sysinfo if(argaddr(0, &addr) < 0) return -1; struct sysinfo info; info.freemem = freemem_size(); info.nproc = count_nproc(); struct proc *p = myproc(); if(copyout(p->pagetable, addr, (char *)&info, sizeof(struct sysinfo)) < 0) return -1; return 0; }
最后还有一个小的细节需要注意
kernel/sysinfo.h
中声明一下这两个实现的函数,不然xv6编译时会把warning当成error处理struct sysinfo { uint64 freemem; // amount of free memory (bytes) uint64 nproc; // number of process }; uint64 freemem_size(void); uint64 count_nproc(void);
六、 总结
至此就完成了lab2的所有内容了,做完lab2,我对xv6的系统调用过程有了一个更清楚、更具体的认知,同时也清楚了xv6是如何启动的,同时也了解了一点点页表和进程相关的知识,也算是承上启下为第三个lab3蓄势了。