mmap映射物理内存之二clean cache-EW帮帮网

   /*
    * Excerpt from the driver's mmap handler: map the physical range selected
    * by the caller into the calling process. The page protection is passed
    * through unchanged (the pgprot_noncached() call below is disabled).
    */
   unsigned long page;
    int ret;
    unsigned long pfn_start = (USR_DATA_MEM_BASE >> PAGE_SHIFT) + vma->vm_pgoff;
    unsigned long size = (unsigned long)(vma->vm_end - vma->vm_start);

    /* vm_pgoff is an unsigned long, so it is printed with %lx (not %llx). */
    printk("phy: 0x%lx,  size: 0x%lx\n", vma->vm_pgoff, size);

    /* Kept disabled: enabling it would switch the mapping to non-cached. */
    // vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

    /*
     * NOTE(review): pfn_start is computed above but vm_pgoff is what gets
     * passed as the PFN. That is only right if user space supplies the
     * physical address as the mmap offset -- confirm against the caller.
     */
    ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, vma->vm_page_prot);
    if (ret) {
        printk("%s: remap_pfn_range failed at [0x%lx  0x%lx]\n",
                __func__, vma->vm_start, vma->vm_end);
        /* Propagate the real errno instead of a bare -1 (which reads as -EPERM). */
        return ret;
    }

随机偏移量和测试大小的生成

/* Size of the mapped region that random test blocks are drawn from. */
#define  MAP_MAX_SIZE   (0x8000000)   /* 128 MB */

/* Cache line size in bytes; the step used by the per-line clean loop. */
#define   CACHE_LINE_SIZE		(64)

#define TEST_BLOCK_MAX_SIZE                 (2 << 20U)    /* at most 2 MB; must also be 64-byte aligned */
#define TEST_BLOCK_ALIGNMENT			CACHE_LINE_SIZE             /* the ql45 cache line is 64 bytes */
/* Mask that clears the low bits below TEST_BLOCK_ALIGNMENT (used for rounding). */
#define TEST_BLOCK_ALIGNMENT_MASK		(~(TEST_BLOCK_ALIGNMENT - 1))
#define TEST_BLOCK_ADDR_ALIGNMENT		CACHE_LINE_SIZE		 /* addresses must be 64-byte aligned as well */

/* set random seed */
srand((int)time(NULL));
		
		/* 随机生成一个偏移，范围0~62MB */
		gen_addr_off = ((unsigned int)rand())%(MAP_MAX_SIZE - TEST_BLOCK_MAX_SIZE);
		gen_addr_off = (gen_addr_off + TEST_BLOCK_ALIGNMENT - 1) & TEST_BLOCK_ALIGNMENT_MASK;
		
		/* 随机生成一个大小，范围64 ~ 2MB，这里选64也是为了64字节对齐 */
		gen_test_size = ((unsigned int)rand())%TEST_BLOCK_MAX_SIZE;
		gen_test_size = (gen_test_size + TEST_BLOCK_ALIGNMENT - 1) & TEST_BLOCK_ALIGNMENT_MASK;
		if(!gen_test_size)
			gen_test_size = CACHE_LINE_SIZE;

clean cache

这里采用aarch64 的清cache指令进行测试，查看是否可以。

 for (i = 0; i < gen_test_size; i = i + 64)
       {
        temp_addr=(unsigned char *)((unsigned char *)test_virt_addr_w+i);
        __asm volatile("dc cvac,%0"::"r"(temp_addr));
     
		
       }

测试流程及数据

测试数据大小为32MB

不调用clean cache

即写程序写入数据后，读程序立即读取，比较。

例如我们写入 0x5a，但是读程序读到0，且是从偏移量为0就没有读对。这说明我们设计的程序可以模拟cache的控制。

After flush, the data for clean is not correct!
index:0,data: 0x0

调用clean cache操作

在写入数据后，调用clean cache的代码。不再提示数据不对，而是反复测试中。这说明clean cache操作生效了。

test size: 1522 KB,cost time : 1995.000000
test size: 1522 KB,cost time : 1584.000000

修改clean cache的步长

前面的操作是以64字节（一个cache line）为步长进行的，这里把步长改为页大小4KB。

/*
 * Experiment: same clean loop, but stepping by 4096 bytes instead of 64.
 * Only the address at the start of each 4 KB step is passed to "dc cvac";
 * the article reports that the reader then sees stale data at index 16
 * (4-byte reads, i.e. the second cache line), so this stride does not work.
 */
for (i = 0; i < gen_test_size; i = i + 4096)
       {
        temp_addr=(unsigned char *)((unsigned char *)test_virt_addr_w+i);
        __asm volatile("dc cvac,%0"::"r"(temp_addr));
     
		
       }

则提示如下信息。我们读取数据是按4字节进行的，索引16即偏移64字节，也就是第二个cache line的起始位置。也就是说，步长增加到4K后，第二个cache line中的数据实际并没有从cache写回（clean）到内存中。这种操作显然不行。

After flush, the data for clean is not correct!
index:16,data: 0x0

32M数据clean

malloc Read back: 0x0, copy (32)M 40970.000000 us

lock +clean

malloc Read back: 0x0, copy (32)M 33757.000000 us

malloc 不带clean cache，少了 27ms，也就是32M数据cache clean，大概27ms

mmap+cache+clean cache

Read back: 0x0, copy (32)M 43951.000000 us

mmap+cache

Read back: 0x0, copy (32)M 17313.000000 us

mmap加clean cache，增加 26MS

内存及操作	耗时（毫秒）
malloc +clean cache	40
malloc+clean cache+lock	33
mmap+copy+cache clean	44
mmap+copy	17

总体而言，32MB 数据，clean cache一次大概需要26MS.

clean时间优化

多个clean cache line的有效性

2个

test size: 1258 KB,cost time : 1094.000000 us
test size: 1258 KB,cost time : 756.000000 us
test size: 1751 KB,cost time : 1501.000000 us
test size: 1751 KB,cost time : 1042.000000 us
test size: 1081 KB,cost time : 941.000000 us
test size: 1081 KB,cost time : 656.000000 us

4个

test size: 1293 KB,cost time : 809.000000 us
test size: 1293 KB,cost time : 476.000000 us
test size: 992 KB,cost time : 623.000000 us
test size: 992 KB,cost time : 368.000000 us

32M数据耗时

一次4个clean cache line操作

    for (i = 0; i < 0x2000000; i = i + 256)
    {
        temp_addr=(unsigned char *)((unsigned char *)mem+i);
        //__asm volatile("dc cvac,%0"::"r"(temp_addr));
        __asm volatile("dc cvac,%0\n" "dc cvac, %1\n" "dc cvac, %2\n" ::"r"(temp_addr),"r"(temp_addr+64),"r"(temp_addr+128),"r"(temp_addr+192): "memory");
    }

Read back: 0x0, copy (32)M 25114.000000 us

一次6个clean cache line操作

Read back: 0x0, copy (32)M 22968.000000 us

内存及操作	耗时（毫秒）
mmap+copy	17
mmap+copy+cache clean	44
mmap+copy+cache clean（2指令）	31
mmap+copy+cache clean（4指令）	25
mmap+copy+cache clean（6指令）	23

通过上述数据，可以看到通过多指令，clean的耗时，从26MS减少到6MS

总结

通过映射同一段物理内存，模拟外设DMA与CPU读写内存操作。验证cache操作的有效性，进而分析优化其耗时。

后续将通过NEON相关指令做进一步优化。整个实验均在单核CPU上以单线程运行。单核并不能占满内存带宽，如果需要更高的速率，还可以考虑由多个核分块处理数据。

mmap映射物理内存之二clean cache

clean cache操作及有效性确认

过程设计

关键程序设计

开cache映射