利用美团龙猫用libxml2编写XML转CSV文件C程序

发布于:2025-09-13 ⋅ 阅读:(20) ⋅ 点赞:(0)

提示词

编写c程序,实现读取xml的row和c标签信息,并输出到csv格式,要求是:数值型c,输出<v>标签的内容,字符串型c(t = “inlineStr”),输出<t>标签的内容;row的r属性表示行号,c的r属性是“字母列号+行号”(如B3),如果列号不连续,就在中间补空列,用逗号分隔各列,一行完全输出后换行,如果1行全为空,不输出该行,为便于查对,每行开头输出row的r属性行号,利用内存映射文件实现。只输出代码,不干别的。

他输出了如下程序

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#include <libxml/parser.h>
#include <libxml/tree.h>

// 字母列号转数字,A1 -> (0,0), B1 -> (1,0)
int col_letter_to_num(const char* col, int len) {
    int num = 0;
    for (int i = 0; i < len; i++) {
        if (col[i] >= 'A' && col[i] <= 'Z')
            num = num * 26 + (col[i] - 'A' + 1);
        else if (col[i] >= 'a' && col[i] <= 'z')
            num = num * 26 + (col[i] - 'a' + 1);
        else
            break;
    }
    return num - 1;
}

// 数字转列字母
void num_to_col_letter(int num, char* out) {
    int len = 0;
    int tmp = num + 1;
    char stack[10];
    int sp = 0;
    while (tmp) {
        int rem = (tmp - 1) % 26;
        stack[sp++] = rem + 'A';
        tmp = (tmp - 1) / 26;
    }
    while (sp--) out[len++] = stack[sp];
    out[len] = 0;
}

// 获取属性值
const char* get_prop(xmlNodePtr node, const char* name) {
    xmlChar* val = xmlGetProp(node, (const xmlChar*)name);
    if (!val) return NULL;
    return (const char*)val;
}

// 解析c节点的值
const char* parse_c_value(xmlNodePtr cnode) {
    xmlNodePtr child = cnode->children;
    const char* t_prop = get_prop(cnode, "t");
    if (t_prop && strcmp(t_prop, "inlineStr") == 0) {
        while (child) {
            if (child->type == XML_ELEMENT_NODE && strcmp((const char*)child->name, "t") == 0) {
                xmlChar* tval = xmlNodeGetContent(child);
                return (const char*)tval;
            }
            child = child->next;
        }
        return "";
    } else {
        while (child) {
            if (child->type == XML_ELEMENT_NODE && strcmp((const char*)child->name, "v") == 0) {
                xmlChar* vval = xmlNodeGetContent(child);
                return (const char*)vval;
            }
            child = child->next;
        }
        return "";
    }
}

int main(int argc, char* argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <input.xml>\n", argv[0]);
        exit(1);
    }
    const char* fname = argv[1];
    int fd = open(fname, O_RDONLY);
    if (fd < 0) { perror("open"); exit(1); }
    struct stat sb;
    if (fstat(fd, &sb) < 0) { perror("fstat"); exit(1); }
    size_t flen = sb.st_size;

    char* mapped = (char*)mmap(NULL, flen, PROT_READ, MAP_PRIVATE, fd, 0);
    if (mapped == MAP_FAILED) { perror("mmap"); exit(1); }
    //printf("mmap");


    // 初始化libxml2
    xmlInitParser();

    // 解析内存中的xml
    xmlDocPtr doc = xmlParseMemory(mapped, flen);
    if (!doc) {
        fprintf(stderr, "xmlParseMemory error\n");
        exit(1);
    }
    xmlNodePtr root = xmlDocGetRootElement(doc);

    // 存储所有行
    char*** rows = NULL;
    int* row_nums = NULL;
    int row_count = 0;
    int max_row = 0, max_col = 0;
    //printf("begin ");
    for (xmlNodePtr node = root; node; node = node->next) {
        if (node->type == XML_ELEMENT_NODE && strcmp((const char*)node->name, "sheetData") == 0) {//printf("sheetData ");
            for (xmlNodePtr row_node = node->children; row_node; row_node = row_node->next) {
                if (row_node->type == XML_ELEMENT_NODE && strcmp((const char*)row_node->name, "row") == 0) {//printf("row ");
                    const char* r_prop = get_prop(row_node, "r");
                    if (!r_prop) continue;
                    int row_num = atoi(r_prop);
                    if (row_num < 1) continue;
                    // 扩展rows
                    if (row_num > max_row) {
                        rows = (char***)realloc(rows, sizeof(char**) * (row_num + 1));
                        row_nums = (int*)realloc(row_nums, sizeof(int) * (row_num + 1));
                        for (int i = max_row + 1; i <= row_num; i++) {
                            rows[i] = NULL;
                            row_nums[i] = 0;
                        }
                        max_row = row_num;
                    }
                    // 统计本行最大列
                    int this_max_col = -1;
                    for (xmlNodePtr cnode = row_node->children; cnode; cnode = cnode->next) {
                        if (cnode->type == XML_ELEMENT_NODE && strcmp((const char*)cnode->name, "c") == 0) {
                            const char* r_attr = get_prop(cnode, "r");
                            if (!r_attr) continue;
                            int col_len = 0;
                            while (r_attr[col_len] && r_attr[col_len] >= 'A' && (r_attr[col_len] <= 'Z' || r_attr[col_len] <= 'z')) col_len++;
                            int col_num = col_letter_to_num(r_attr, col_len);
                            if (col_num > this_max_col) this_max_col = col_num;
                        }
                    }
                    if (this_max_col < 0) continue;
                    if (this_max_col > max_col) max_col = this_max_col;

                    // 初始化本行
                    rows[row_num] = (char**)calloc((this_max_col + 1), sizeof(char*));
                    row_nums[row_num] = row_num;

                    // 遍历c节点
                    for (xmlNodePtr cnode = row_node->children; cnode; cnode = cnode->next) {
                        if (cnode->type == XML_ELEMENT_NODE && strcmp((const char*)cnode->name, "c") == 0) {
                            const char* r_attr = get_prop(cnode, "r");
                            if (!r_attr) continue;
                            int col_len = 0;
                            while (r_attr[col_len] && r_attr[col_len] >= 'A' && (r_attr[col_len] <= 'Z' || r_attr[col_len] <= 'z')) col_len++;
                            int col_num = col_letter_to_num(r_attr, col_len);
                            if (col_num < 0) continue;
                            const char* val = parse_c_value(cnode);
                            rows[row_num][col_num] = strdup(val ? val : "");
                            //if (val && (val != (const char*)xmlNodeGetContent(cnode))) xmlFree((void*)val);
                        }
                    }
                    row_count++;
                }
            }
        }
    }

    // 输出到stdout
    for (int r = 1; r <= max_row; r++) {
        if (!rows[r]) continue;
        int has_data = 0;
        for (int c = 0; c <= max_col; c++) {
            if (rows[r][c] && strlen(rows[r][c]) > 0) {
                has_data = 1; break;
            }
        }
        if (!has_data) continue;

        // 输出行号
        printf("%d", r);
        for (int c = 0; c <= max_col; c++) {
            if (rows[r][c] && strlen(rows[r][c]) > 0)
                printf(",%s", rows[r][c]);
            else
                printf(",");
        }
        printf("\n");
    }
//return 0;
    // 释放内存
    for (int r = 1; r <= max_row; r++) {
        if (rows[r]) {
            for (int c = 0; c <= max_col; c++) {
                if (rows[r][c]) free(rows[r][c]);
            }
            free(rows[r]);
        }
    }
    if (rows) free(rows);
    if (row_nums) free(row_nums);
    xmlFreeDoc(doc);
    xmlCleanupParser();
    munmap(mapped, flen);
    close(fd);
    return 0;
}

查看#include的头文件可知,它调用了libxml2库,这里用的是libxml2的DOM(树形)接口,会先把整个文档解析成内存中的一棵树。
用whereis命令查看系统里是否已安装该库:

whereis libxml2.so
libxml2.so: /usr/lib/aarch64-linux-gnu/libxml2.so
whereis tree.h    
tree.h:

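我的系统里有so文件,但是没有头文件,可能是安装不全所致,所以下载源代码,将里面include目录下的内容解压到当前目录。
然后用如下命令行编译、执行。一开始报错(munmap_chunk(): invalid pointer)。这个错误来自glibc的free()对指针的检查,并不是内存映射文件的munmap出了问题:parse_c_value在找不到内容时返回的是字符串常量"",对它调用xmlFree就会触发该错误。后来将相关的释放内存代码xmlFree注释掉,就不报错了(代价是这些字符串不再被释放)。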

gcc catxml.c -I . -lxml2 -o catxml
./catxml sheet13.xml
munmap_chunk(): invalid pointer
Aborted (core dumped)
./catxml sheet13.xml
1,1,15519,785,1,17.00,24386.67,0.04,0.02,,,35137.0,35107.0,35146.0,,,
2,1,6731,732,2,36.00,58958.28,0.09,0.06,,,35167.0,35123.0,35175.0,,,

time ./catxml  /par/lineitem/xl/worksheets/sheet1.xml >catsheet1.csv
^C

real	1m7.394s
user	0m7.728s
sys	0m5.088s

输出小文件正常;处理700MB的大文件时,运行一分多钟仍没有完成,只能按Ctrl+C中断。推测原因是DOM方式需要先把整个文档建成内存中的树,大文件下内存占用过大。


网站公告

今日签到

点亮在社区的每一天
去签到