Linux I/O 访问架构深入分析
目录
- 概述
- I/O 架构层次
- 核心数据结构
- I/O 处理流程
- VFS 虚拟文件系统
- 块设备I/O
- 字符设备I/O
- 内存映射I/O
- 异步I/O机制
- I/O调度器
- 调试工具与方法
- 性能优化策略
概述
Linux I/O 系统是一个多层次、高度抽象的架构,旨在为应用程序提供统一的文件访问接口,同时支持各种不同类型的存储设备和文件系统。
I/O 架构层次
架构分层表
层次 |
组件 |
主要功能 |
关键数据结构 |
用户空间 |
应用程序 |
文件操作API调用 |
FILE*, fd |
系统调用 |
内核入口 |
参数验证、权限检查 |
sys_call_table |
VFS层 |
虚拟文件系统 |
统一文件接口抽象 |
inode, dentry, file |
文件系统层 |
ext4/xfs/btrfs等 |
具体文件系统实现 |
super_block, inode_operations |
页缓存层 |
Page Cache |
I/O缓存和优化 |
address_space, page |
块设备层 |
Block Layer |
块设备I/O管理 |
bio, request, request_queue |
设备驱动层 |
驱动程序 |
硬件抽象接口 |
block_device_operations |
硬件层 |
存储设备 |
物理存储介质 |
硬件寄存器、DMA |
核心数据结构
文件系统核心结构
/*
 * struct file - one open instance of a file (abridged kernel excerpt).
 *
 * The architecture table earlier in this document lists it as a VFS-layer
 * structure. Field notes follow the upstream kernel's meaning; members not
 * shown here are omitted by the excerpt.
 */
struct file {
struct path f_path;                      /* mount + dentry the file was opened through */
struct inode *f_inode;                   /* cached pointer to the backing inode */
const struct file_operations *f_op;      /* method table used for I/O on this file */
spinlock_t f_lock;                       /* protects f_flags (among others upstream) */
atomic_long_t f_count;                   /* reference count */
unsigned int f_flags;                    /* open(2) flags (O_*) */
fmode_t f_mode;                          /* FMODE_* access mode bits */
struct mutex f_pos_lock;                 /* serialises updates of f_pos */
loff_t f_pos;                            /* current file offset */
struct fown_struct f_owner;              /* owner for signal-driven I/O notification */
const struct cred *f_cred;               /* credentials of the opener */
struct file_ra_state f_ra;               /* per-file readahead state */
u64 f_version;                           /* version counter; presumably used by some filesystems to detect changes */
void *private_data;                      /* opaque pointer owned by the driver/filesystem */
struct address_space *f_mapping;         /* page-cache mapping backing this file */
};
/*
 * struct inode - in-memory representation of a filesystem object
 * (abridged kernel excerpt; only a subset of the upstream members is shown).
 */
struct inode {
umode_t i_mode;                          /* file type and permission bits */
unsigned short i_opflags;                /* internal IOP_* flags */
kuid_t i_uid;                            /* owner user id */
kgid_t i_gid;                            /* owner group id */
unsigned int i_flags;                    /* S_* inode flags */
const struct inode_operations *i_op;     /* namespace operations (create, lookup, ...) */
struct super_block *i_sb;                /* superblock of the owning filesystem */
struct address_space *i_mapping;         /* page-cache mapping for the file data */
void *i_security;                        /* security-module private data */
unsigned long i_ino;                     /* inode number */
dev_t i_rdev;                            /* device number, for device special files */
loff_t i_size;                           /* file size in bytes */
struct timespec64 i_atime;               /* last access time */
struct timespec64 i_mtime;               /* last data modification time */
struct timespec64 i_ctime;               /* last inode change time */
spinlock_t i_lock;                       /* upstream: protects i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;                  /* bytes used beyond the last full block counted in i_blocks */
u8 i_blkbits;                            /* log2 of the block size */
blkcnt_t i_blocks;                       /* number of blocks allocated */
const struct file_operations *i_fop;     /* default f_op for files opened on this inode */
struct hlist_head i_dentry;              /* dentries (aliases) referring to this inode */
struct rw_semaphore i_rwsem;             /* inode read/write semaphore */
/* Exactly one member is meaningful, depending on the kind of inode. */
union {
struct pipe_inode_info *i_pipe;          /* pipe / FIFO state */
struct cdev *i_cdev;                     /* character device */
char *i_link;                            /* symlink target */
unsigned i_dir_seq;                      /* directory sequence counter */
};
};
/*
 * struct file_operations - method table a filesystem or driver supplies for
 * an open file (kernel excerpt). struct file::f_op, struct inode::i_fop and
 * struct cdev::ops in this document all point at a table of this type.
 * A hook left NULL means the operation is not provided by the implementer.
 */
struct file_operations {
struct module *owner;                    /* module owning this table */
/* Positioning and data transfer. */
loff_t (*llseek)(struct file *, loff_t, int);
ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
/* iov_iter/kiocb based variants of read/write. */
ssize_t (*read_iter)(struct kiocb *, struct iov_iter *);
ssize_t (*write_iter)(struct kiocb *, struct iov_iter *);
int (*iopoll)(struct kiocb *kiocb, bool spin);   /* poll for completion of a kiocb */
/* Directory enumeration. */
int (*iterate)(struct file *, struct dir_context *);
int (*iterate_shared)(struct file *, struct dir_context *);
__poll_t (*poll)(struct file *, struct poll_table_struct *);   /* readiness for poll/select/epoll */
/* Device/filesystem specific control requests. */
long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
int (*mmap)(struct file *, struct vm_area_struct *);   /* set up a memory mapping of the file */
unsigned long mmap_supported_flags;
/* Open / close lifecycle. */
int (*open)(struct inode *, struct file *);
int (*flush)(struct file *, fl_owner_t id);
int (*release)(struct inode *, struct file *);
int (*fsync)(struct file *, loff_t, loff_t, int datasync);   /* write back a byte range */
int (*fasync)(int, struct file *, int);   /* enable/disable async (signal-driven) notification */
int (*lock)(struct file *, int, struct file_lock *);
ssize_t (*sendpage)(struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*flock)(struct file *, int, struct file_lock *);
/* Pipe-based transfer (splice). */
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *,
struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
/* File-to-file copy / remap between two open files. */
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
loff_t, size_t, unsigned int);
loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
};
块设备I/O核心结构
/*
 * struct bio - descriptor of one block I/O operation (abridged kernel
 * excerpt). The architecture table in this document lists it, together with
 * request and request_queue, as a block-layer structure. The data is
 * described as a vector of (page, offset, length) segments in bi_io_vec.
 */
struct bio {
struct bio *bi_next;                     /* link to the next bio in a list */
struct gendisk *bi_disk;                 /* target disk */
unsigned int bi_opf;                     /* operation (REQ_OP_*) plus request flags */
unsigned short bi_flags;                 /* BIO_* status flags */
unsigned short bi_ioprio;                /* I/O priority */
unsigned short bi_write_hint;            /* write lifetime hint */
blk_status_t bi_status;                  /* completion status */
u8 bi_partno;                            /* partition number on bi_disk */
atomic_t __bi_remaining;                 /* outstanding completions before bi_end_io runs */
struct bvec_iter bi_iter;                /* current position within the bio */
bio_end_io_t *bi_end_io;                 /* completion callback */
void *bi_private;                        /* opaque pointer for the bio's submitter */
struct bio_crypt_ctx *bi_crypt_context;  /* inline-encryption context */
struct bio_integrity_payload *bi_integrity; /* data-integrity payload */
unsigned short bi_vcnt;                  /* number of bio_vec entries in use */
unsigned short bi_max_vecs;              /* capacity of bi_io_vec */
atomic_t __bi_cnt;                       /* reference count */
struct bio_vec *bi_io_vec;               /* the segment vector */
struct bio_set *bi_pool;                 /* bio_set this bio was allocated from */
struct bio_vec bi_inline_vecs[];         /* flexible array of inline segments at the end of the bio */
};
/*
 * struct request - block-layer request built from one or more bios and
 * handed to the driver through a request_queue (abridged kernel excerpt).
 */
struct request {
struct request_queue *q;                 /* queue this request belongs to */
struct blk_mq_ctx *mq_ctx;               /* blk-mq software queue context */
struct blk_mq_hw_ctx *mq_hctx;           /* blk-mq hardware queue context */
unsigned int cmd_flags;                  /* operation and common flags */
req_flags_t rq_flags;                    /* request-internal RQF_* flags */
int tag;                                 /* driver tag */
int internal_tag;                        /* scheduler tag */
sector_t __sector;                       /* current sector position */
unsigned int __data_len;                 /* total data length */
struct bio *bio;                         /* first bio of the request */
struct bio *biotail;                     /* last bio of the request */
struct hlist_node hash;                  /* hash linkage (used upstream for merge lookup) */
union {
struct rb_node rb_node;                  /* tree linkage for schedulers that sort requests */
struct bio_vec special_vec;              /* driver-supplied payload vector */
};
/* NOTE(review): margin_lvl is not a member I can confirm in the upstream
 * definition of struct request - verify against the kernel version this
 * excerpt was taken from. */
union {
struct hd_struct *part;                  /* partition the request targets */
int margin_lvl;
};
unsigned long deadline;                  /* timeout deadline */
struct list_head timeout_list;
unsigned int timeout;                    /* timeout value for this request */
int retries;
rq_end_io_fn *end_io;                    /* completion callback */
void *end_io_data;                       /* argument data for end_io */
};
I/O 处理流程
系统调用到设备驱动的数据流
read系统调用详细流程
VFS 虚拟文件系统
VFS 架构关系图
VFS核心操作表
操作类型 |
结构体 |
主要函数 |
功能描述 |
文件操作 |
file_operations |
read, write, open, release |
文件I/O操作 |
inode操作 |
inode_operations |
create, lookup, mkdir, rmdir |
文件系统对象操作 |
地址空间操作 |
address_space_operations |
readpage, writepage, direct_IO |
页缓存操作 |
超级块操作 |
super_operations |
alloc_inode, destroy_inode, sync_fs |
文件系统级操作 |
目录项操作 |
dentry_operations |
d_revalidate, d_hash, d_compare |
目录缓存操作 |
块设备I/O
块设备I/O架构
BIO生命周期
字符设备I/O
字符设备架构
/*
 * struct cdev - kernel object representing a character device
 * (kernel excerpt). It ties a range of device numbers to the
 * file_operations table that services opens of those devices.
 */
struct cdev {
struct kobject kobj;                     /* embedded kobject */
struct module *owner;                    /* module providing the device */
const struct file_operations *ops;       /* operations used for files opened on this device */
struct list_head list;                   /* list linkage */
dev_t dev;                               /* first device number covered */
unsigned int count;                      /* number of consecutive device numbers covered */
};
static struct file_operations globalmem_fops = {
.owner = THIS_MODULE,
.llseek = globalmem_llseek,
.read = globalmem_read,
.write = globalmem_write,
.unlocked_ioctl = globalmem_ioctl,
.open = globalmem_open,
.release = globalmem_release,
};
字符设备I/O流程
内存映射I/O
mmap机制
mmap系统调用流程
/*
 * globalmem_mmap - map the globalmem buffer into a user address space.
 * @filp: open file the mapping was requested on (unused here)
 * @vma:  the virtual memory area to populate
 *
 * The requested window [vm_pgoff pages, + mapping size) must lie entirely
 * inside the GLOBALMEM_SIZE-byte buffer. The original ignored vm_pgoff, so a
 * mapping requested at a nonzero file offset silently exposed the start of
 * the buffer instead.
 *
 * Assumes globalmem_devp->mem is physically contiguous, directly mapped
 * memory (e.g. from kmalloc), as required by virt_to_phys() - TODO confirm
 * against the allocation site.
 *
 * Return: 0 on success, -EINVAL if the window does not fit in the buffer,
 * otherwise the error code reported by remap_pfn_range().
 */
static int globalmem_mmap(struct file *filp, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned long offset;
	unsigned long pfn;
	int ret;

	/* Compare in pages first so the shift below cannot overflow. */
	if (vma->vm_pgoff > (GLOBALMEM_SIZE >> PAGE_SHIFT))
		return -EINVAL;

	offset = vma->vm_pgoff << PAGE_SHIFT;
	if (size > GLOBALMEM_SIZE - offset)
		return -EINVAL;

	/*
	 * NOTE: on kernels >= 6.3 vm_flags must be changed through
	 * vm_flags_set() instead of a direct assignment.
	 */
	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;

	pfn = (virt_to_phys(globalmem_devp->mem) >> PAGE_SHIFT) + vma->vm_pgoff;

	/* Propagate the real failure reason instead of a blanket -EAGAIN. */
	ret = remap_pfn_range(vma, vma->vm_start, pfn, size, vma->vm_page_prot);
	if (ret)
		return ret;

	return 0;
}
异步I/O机制
AIO架构
io_uring新机制
I/O调度器
调度器对比表
调度器 |
特点 |
适用场景 |
算法复杂度 |
noop(blk-mq 内核中对应 none)|
简单FIFO |
SSD、虚拟化环境 |
O(1) |
deadline(blk-mq 内核中为 mq-deadline)|
截止时间保证 |
实时系统 |
O(log n) |
cfq(传统单队列调度器,自 Linux 5.0 起已移除)|
完全公平队列 |
多用户环境 |
O(log n) |
bfq |
预算公平队列 |
交互式应用 |
O(log n) |
kyber |
多队列优化 |
高性能SSD |
O(1) |
CFQ调度器算法
调试工具与方法
系统I/O监控工具
工具名称 |
功能描述 |
使用场景 |
输出信息 |
iostat |
I/O统计信息 |
性能监控 |
IOPS、吞吐量、延迟 |
iotop |
进程I/O排序 |
问题定位 |
每进程I/O使用率 |
blktrace |
块设备跟踪 |
深度分析 |
I/O请求路径 |
strace |
系统调用跟踪 |
调试 |
系统调用序列 |
perf |
性能分析 |
优化 |
CPU、I/O热点 |
ftrace |
内核函数跟踪 |
内核调试 |
函数调用链 |
常用调试命令
# Extended per-device I/O statistics, refreshed every second.
iostat -x 1
# Per-process I/O, showing only processes that are actually doing I/O.
iotop -o
# Memory, swap, block I/O and CPU summary, refreshed every second.
vmstat 1
# Trace block-layer events on /dev/sda; output files use the base name "trace".
blktrace -d /dev/sda -o trace
# Decode the captured per-CPU trace file into readable form.
blkparse trace.blktrace.0
# I/O counters of one process (replace PID with the process id).
cat /proc/PID/io
# Open files below a directory (replace /path).
lsof +D /path
# Enable all block-layer tracepoints in ftrace, then read the trace buffer.
echo 1 > /sys/kernel/debug/tracing/events/block/enable
cat /sys/kernel/debug/tracing/trace
# Page-cache / buffer / dirty-page counters.
cat /proc/meminfo | grep -E "(Cached|Buffers|Dirty)"
# Drop clean page cache, dentries and inodes. Dirty pages are not dropped,
# so run `sync` first if a cold cache is wanted. Test systems only.
echo 3 > /proc/sys/vm/drop_caches
# Filesystem usage in human-readable units.
df -h
# Mounted filesystems, aligned into columns.
mount | column -t
# Superblock parameters of the ext2/3/4 filesystem on /dev/sda1.
tune2fs -l /dev/sda1
性能分析脚本
#!/bin/bash
# One-shot snapshot of I/O-related system state: device statistics, busiest
# I/O processes, disk usage, cache state, file-handle usage and the I/O
# scheduler of every block device. Needs iostat (sysstat) and iotop; iotop
# normally requires root.

echo "=== I/O Performance Analysis ==="

echo "1. Basic I/O Statistics:"
iostat -x 1 5

echo "2. Top I/O Processes:"
iotop -a -o -d 1 -n 5

echo "3. Disk Usage:"
df -h

echo "4. Memory and Cache Status:"
free -h
# grep reads the file directly; no need for a cat pipeline.
grep -E "(Cached|Buffers|Dirty|Writeback)" /proc/meminfo

echo "5. File Descriptor Usage:"
cat /proc/sys/fs/file-nr

echo "6. I/O Scheduler:"
for dev in /sys/block/*/queue/scheduler; do
    # When the glob matches nothing, $dev is the literal pattern; skip it.
    [ -r "$dev" ] || continue
    echo "$dev: $(cat "$dev")"
done
内核调试技术
/*
 * io_debug() - compile-time switchable debug print.
 *
 * With DEBUG_IO non-zero the message is emitted through printk() at
 * KERN_DEBUG level with an "IO_DEBUG: " prefix. With DEBUG_IO zero the call
 * expands to no_printk(), which generates no output but still lets the
 * compiler check the format string against its arguments and keeps those
 * arguments "used". The original empty expansion did neither.
 */
#define DEBUG_IO 1
#if DEBUG_IO
#define io_debug(fmt, ...) \
	printk(KERN_DEBUG "IO_DEBUG: " fmt, ##__VA_ARGS__)
#else
#define io_debug(fmt, ...) \
	no_printk(KERN_DEBUG "IO_DEBUG: " fmt, ##__VA_ARGS__)
#endif
#include <linux/tracepoint.h>
/*
 * Tracepoint my_io_event(file, count, pos).
 *
 * Each event records three values: the inode number taken from
 * file->f_inode->i_ino, the byte count, and the file position. They are
 * rendered as "inode=<n> count=<n> pos=<n>". The event is emitted through
 * the generated trace_my_io_event() helper.
 */
TRACE_EVENT(my_io_event,
TP_PROTO(struct file *file, size_t count, loff_t pos),
TP_ARGS(file, count, pos),
TP_STRUCT__entry(
__field(unsigned long, inode)
__field(size_t, count)
__field(loff_t, pos)
),
TP_fast_assign(
__entry->inode = file->f_inode->i_ino;
__entry->count = count;
__entry->pos = pos;
),
TP_printk("inode=%lu count=%zu pos=%lld",
__entry->inode, __entry->count, __entry->pos)
);
#define pr_debug_io(fmt, ...) \
pr_debug("IO: " fmt, ##__VA_ARGS__)
/*
 * my_read - example read handler showing three debug facilities.
 * @filp:  file being read
 * @buf:   user buffer (not written by this example)
 * @count: number of bytes requested
 * @ppos:  file position (read but not advanced by this example)
 *
 * Logs the request via io_debug(), emits the my_io_event tracepoint and
 * prints the inode number via pr_debug_io().
 *
 * NOTE(review): the handler reports @count bytes as read without copying
 * anything to @buf or advancing *@ppos. This looks like a placeholder for
 * the real transfer; confirm before reusing it in a driver.
 *
 * Return: @count.
 */
static ssize_t my_read(struct file *filp, char __user *buf,
		       size_t count, loff_t *ppos)
{
	const loff_t pos = *ppos;

	io_debug("Read request: count=%zu, pos=%lld\n", count, pos);
	trace_my_io_event(filp, count, pos);
	pr_debug_io("Processing read for inode %lu\n", filp->f_inode->i_ino);

	return count;
}
性能优化策略
I/O优化技术对比
优化技术 |
原理 |
适用场景 |
性能提升 |
页缓存预读 |
预先加载后续页面 |
顺序访问 |
2-10x |
异步I/O |
非阻塞I/O操作 |
高并发应用 |
5-50x |
直接I/O |
绕过页缓存 |
大文件传输 |
20-30% |
内存映射 |
避免数据拷贝 |
随机访问 |
10-50% |
批量I/O |
合并多个请求 |
小块I/O |
2-5x |
I/O调度优化 |
减少磁盘寻道 |
机械硬盘 |
20-100% |
优化配置示例
# Select the mq-deadline I/O scheduler for sda.
echo mq-deadline > /sys/block/sda/queue/scheduler
# Set the readahead window of sda to 4096 KiB.
echo 4096 > /sys/block/sda/queue/read_ahead_kb
# Set the request queue depth of sda to 128.
echo 128 > /sys/block/sda/queue/nr_requests
# Lower the kernel's tendency to swap (vm.swappiness = 10).
echo 10 > /proc/sys/vm/swappiness
# Enable zone reclaim (vm.zone_reclaim_mode = 1).
# NOTE(review): this is a NUMA-specific setting - confirm it suits the workload.
echo 1 > /proc/sys/vm/zone_reclaim_mode
# Remount / without access-time updates for files and directories.
mount -o remount,noatime,nodiratime /
应用层优化建议
int fd = open("largefile.dat", O_RDONLY | O_DIRECT);
posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
madvise(addr, length, MADV_WILLNEED);
struct iovec iov[MAX_IOV];
writev(fd, iov, iovcnt);
struct aiocb cb;
aio_read(&cb);
aio_suspend(&cb, 1, NULL);
总结
Linux I/O架构是一个复杂而精密的系统,通过多层抽象和优化技术,为应用程序提供了高效、统一的存储访问接口。理解其工作原理和掌握相关的调试技术,对于系统性能优化和问题诊断具有重要意义。
关键要点
- 分层架构:VFS提供统一接口,底层支持多种文件系统和设备类型
- 缓存机制:页缓存显著提升I/O性能,但需要合理管理
- 异步处理:现代I/O栈大量使用异步机制减少延迟
- 调度优化:不同的I/O调度器适用于不同的应用场景
- 性能监控:丰富的工具链支持深度性能分析和问题诊断
通过深入理解这些机制并合理应用优化技术,可以显著提升系统的I/O性能和响应能力。