提示词
编写C程序,实现读取xml的row和c标签信息,并输出到csv格式,要求是:数值型c,输出<v>标签的内容,字符串型c(t="inlineStr"),输出<t>的内容,row的r属性表示行号,c的r属性是字母列号加行号,如果列号不连续,就在中间补空列,用逗号分隔各列,一行完全输出后换行,如果一行全为空,不输出该行,为便于查对,每行开头输出row的r属性行号,利用内存映射文件实现。只输出代码,不干别的。
它输出了如下程序
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <libxml/parser.h>
#include <libxml/tree.h>
// Convert the leading letters of a cell reference into a zero-based column
// index: "A" -> 0, "B" -> 1, "Z" -> 25, "AA" -> 26. Letters are accepted in
// either case. At most `len` characters of `col` are examined and scanning
// stops at the first non-letter. Returns -1 when no letter was consumed.
int col_letter_to_num(const char* col, int len) {
    int value = 0;
    int pos = 0;
    while (pos < len) {
        char ch = col[pos];
        int digit;
        if (ch >= 'A' && ch <= 'Z') {
            digit = ch - 'A' + 1;
        } else if (ch >= 'a' && ch <= 'z') {
            digit = ch - 'a' + 1;
        } else {
            break;
        }
        // Bijective base 26: every letter contributes 1..26, never 0.
        value = value * 26 + digit;
        pos++;
    }
    return value - 1;
}
// Convert a zero-based column index into its letter form: 0 -> "A",
// 25 -> "Z", 26 -> "AA". The NUL-terminated result is written to `out`,
// which the caller must size for the letters plus the terminator.
void num_to_col_letter(int num, char* out) {
    char digits[10];
    int count = 0;
    // Peel off the least significant letter first (bijective base 26).
    for (int rest = num + 1; rest != 0; rest = (rest - 1) / 26) {
        digits[count++] = (char)('A' + (rest - 1) % 26);
    }
    // The letters were produced in reverse order; copy them back-to-front.
    int written = 0;
    while (count > 0) {
        out[written++] = digits[--count];
    }
    out[written] = '\0';
}
// Fetch the value of attribute `name` on `node` as a C string.
// Returns NULL when xmlGetProp() finds no such attribute. Per the libxml2
// documentation the non-NULL result of xmlGetProp() is a fresh copy that the
// caller is expected to release with xmlFree().
const char* get_prop(xmlNodePtr node, const char* name) {
    return (const char*)xmlGetProp(node, (const xmlChar*)name);
}
// 解析c节点的值
const char* parse_c_value(xmlNodePtr cnode) {
xmlNodePtr child = cnode->children;
const char* t_prop = get_prop(cnode, "t");
if (t_prop && strcmp(t_prop, "inlineStr") == 0) {
while (child) {
if (child->type == XML_ELEMENT_NODE && strcmp((const char*)child->name, "t") == 0) {
xmlChar* tval = xmlNodeGetContent(child);
return (const char*)tval;
}
child = child->next;
}
return "";
} else {
while (child) {
if (child->type == XML_ELEMENT_NODE && strcmp((const char*)child->name, "v") == 0) {
xmlChar* vval = xmlNodeGetContent(child);
return (const char*)vval;
}
child = child->next;
}
return "";
}
}
int main(int argc, char* argv[]) {
if (argc != 2) {
fprintf(stderr, "Usage: %s <input.xml>\n", argv[0]);
exit(1);
}
const char* fname = argv[1];
int fd = open(fname, O_RDONLY);
if (fd < 0) { perror("open"); exit(1); }
struct stat sb;
if (fstat(fd, &sb) < 0) { perror("fstat"); exit(1); }
size_t flen = sb.st_size;
char* mapped = (char*)mmap(NULL, flen, PROT_READ, MAP_PRIVATE, fd, 0);
if (mapped == MAP_FAILED) { perror("mmap"); exit(1); }
//printf("mmap");
// 初始化libxml2
xmlInitParser();
// 解析内存中的xml
xmlDocPtr doc = xmlParseMemory(mapped, flen);
if (!doc) {
fprintf(stderr, "xmlParseMemory error\n");
exit(1);
}
xmlNodePtr root = xmlDocGetRootElement(doc);
// 存储所有行
char*** rows = NULL;
int* row_nums = NULL;
int row_count = 0;
int max_row = 0, max_col = 0;
//printf("begin ");
for (xmlNodePtr node = root; node; node = node->next) {
if (node->type == XML_ELEMENT_NODE && strcmp((const char*)node->name, "sheetData") == 0) {//printf("sheetData ");
for (xmlNodePtr row_node = node->children; row_node; row_node = row_node->next) {
if (row_node->type == XML_ELEMENT_NODE && strcmp((const char*)row_node->name, "row") == 0) {//printf("row ");
const char* r_prop = get_prop(row_node, "r");
if (!r_prop) continue;
int row_num = atoi(r_prop);
if (row_num < 1) continue;
// 扩展rows
if (row_num > max_row) {
rows = (char***)realloc(rows, sizeof(char**) * (row_num + 1));
row_nums = (int*)realloc(row_nums, sizeof(int) * (row_num + 1));
for (int i = max_row + 1; i <= row_num; i++) {
rows[i] = NULL;
row_nums[i] = 0;
}
max_row = row_num;
}
// 统计本行最大列
int this_max_col = -1;
for (xmlNodePtr cnode = row_node->children; cnode; cnode = cnode->next) {
if (cnode->type == XML_ELEMENT_NODE && strcmp((const char*)cnode->name, "c") == 0) {
const char* r_attr = get_prop(cnode, "r");
if (!r_attr) continue;
int col_len = 0;
while (r_attr[col_len] && r_attr[col_len] >= 'A' && (r_attr[col_len] <= 'Z' || r_attr[col_len] <= 'z')) col_len++;
int col_num = col_letter_to_num(r_attr, col_len);
if (col_num > this_max_col) this_max_col = col_num;
}
}
if (this_max_col < 0) continue;
if (this_max_col > max_col) max_col = this_max_col;
// 初始化本行
rows[row_num] = (char**)calloc((this_max_col + 1), sizeof(char*));
row_nums[row_num] = row_num;
// 遍历c节点
for (xmlNodePtr cnode = row_node->children; cnode; cnode = cnode->next) {
if (cnode->type == XML_ELEMENT_NODE && strcmp((const char*)cnode->name, "c") == 0) {
const char* r_attr = get_prop(cnode, "r");
if (!r_attr) continue;
int col_len = 0;
while (r_attr[col_len] && r_attr[col_len] >= 'A' && (r_attr[col_len] <= 'Z' || r_attr[col_len] <= 'z')) col_len++;
int col_num = col_letter_to_num(r_attr, col_len);
if (col_num < 0) continue;
const char* val = parse_c_value(cnode);
rows[row_num][col_num] = strdup(val ? val : "");
//if (val && (val != (const char*)xmlNodeGetContent(cnode))) xmlFree((void*)val);
}
}
row_count++;
}
}
}
}
// 输出到stdout
for (int r = 1; r <= max_row; r++) {
if (!rows[r]) continue;
int has_data = 0;
for (int c = 0; c <= max_col; c++) {
if (rows[r][c] && strlen(rows[r][c]) > 0) {
has_data = 1; break;
}
}
if (!has_data) continue;
// 输出行号
printf("%d", r);
for (int c = 0; c <= max_col; c++) {
if (rows[r][c] && strlen(rows[r][c]) > 0)
printf(",%s", rows[r][c]);
else
printf(",");
}
printf("\n");
}
//return 0;
// 释放内存
for (int r = 1; r <= max_row; r++) {
if (rows[r]) {
for (int c = 0; c <= max_col; c++) {
if (rows[r][c]) free(rows[r][c]);
}
free(rows[r]);
}
}
if (rows) free(rows);
if (row_nums) free(row_nums);
xmlFreeDoc(doc);
xmlCleanupParser();
munmap(mapped, flen);
close(fd);
return 0;
}
查看#include头文件可知,它调用了libxml2库,这是一个DOM类型的库
用whereis命令查看,
whereis libxml2.so
libxml2.so: /usr/lib/aarch64-linux-gnu/libxml2.so
whereis tree.h
tree.h:
我的系统里有so文件,但是没有头文件,可能是安装不全所致,所以下载libxml2源代码,将其中include目录下的内容解压到当前目录。
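然后用如下命令行编译、执行。一开始报错(munmap_chunk(): invalid pointer)。这个错误由glibc的free()报出,与内存映射文件无关:parse_c_value在找不到值时返回字符串常量"",对这个非堆指针调用xmlFree就会触发该错误。后来将相关的释放内存代码xmlFree注释掉,就不报错了(代价是这些字符串不再被释放)。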
gcc catxml.c -I . -lxml2 -o catxml
./catxml sheet13.xml
munmap_chunk(): invalid pointer
Aborted (core dumped)
./catxml sheet13.xml
1,1,15519,785,1,17.00,24386.67,0.04,0.02,,,35137.0,35107.0,35146.0,,,
2,1,6731,732,2,36.00,58958.28,0.09,0.06,,,35167.0,35123.0,35175.0,,,
time ./catxml /par/lineitem/xl/worksheets/sheet1.xml >catsheet1.csv
^C
real 1m7.394s
user 0m7.728s
sys 0m5.088s
输出小文件正常,输出700MB的大文件就不行了。