CyberRT(apollo) IPC(shm)通信包重复/丢包 bug 及解决方案

发布于:2025-02-22 ⋅ 阅读:(12) ⋅ 点赞:(0)

这里写自定义目录标题

问题

前几年阅读了下CyberRT的源码,发现了一个进程间shm通信线程安全的问题,不过最后也没怎么用CyberRT就是了;
前段时间,apollo 发布10.0,看了下几年前的严重bug还是存在ORZ。
截止2025/02/20 Commit c48541b 仍未修复。
路径 apollo/cyber/transport/shm/condition_notifier.cc

conditon_notifier ,在CyberRT里面shm通信机制下使用,用于通知reader 监听新消息,以及writer 产生新消息进行notify;

关键代码如下


 struct Indicator {
    std::atomic<uint64_t> next_seq = {0};
    ReadableInfo infos[kBufLength];
    uint64_t seqs[kBufLength] = {0};
  };
  
bool ConditionNotifier::Notify(const ReadableInfo& info) {
  if (is_shutdown_.load()) {
    ADEBUG << "notifier is shutdown.";
    return false;
  }

  uint64_t seq = indicator_->next_seq.fetch_add(1);
  uint64_t idx = seq % kBufLength;
  indicator_->infos[idx] = info; // BUG
  indicator_->seqs[idx] = seq; // BUG

  return true;
}

bool ConditionNotifier::Listen(int timeout_ms, ReadableInfo* info) {
  if (info == nullptr) {
    AERROR << "info nullptr.";
    return false;
  }

  if (is_shutdown_.load()) {
    ADEBUG << "notifier is shutdown.";
    return false;
  }

  int timeout_us = timeout_ms * 1000;
  while (!is_shutdown_.load()) {
    uint64_t seq = indicator_->next_seq.load();
    if (seq != next_seq_) {
      auto idx = next_seq_ % kBufLength;
      auto actual_seq = indicator_->seqs[idx];
      if (actual_seq >= next_seq_) {
        next_seq_ = actual_seq;
        *info = indicator_->infos[idx];
        ++next_seq_;
        return true;
      } else {
        ADEBUG << "seq[" << next_seq_ << "] is writing, can not read now.";
      }
    }

    if (timeout_us > 0) {
      std::this_thread::sleep_for(std::chrono::microseconds(50));
      timeout_us -= 50;
    } else {
      return false;
    }
  }
  return false;
}

// 接收线程func
void ShmDispatcher::ThreadFunc() {
  ReadableInfo readable_info;
  while (!is_shutdown_.load()) {
  	// notify 为 conditon_notifier
    if (!notifier_->Listen(100, &readable_info)) {
      ADEBUG << "listen failed.";
      continue;
    }
   	XXXXXX 省略
    ReadMessage(channel_id, block_index);
  }
}

// 发送端
template <typename M>
bool ShmTransmitter<M>::Transmit(const M& msg, const MessageInfo& msg_info) {
  XXXXXX 省略
  return notifier_->Notify(readable_info);
}

从以上代码可知,apollo中通过next_seq通知接受端接受数据,同时next_seq为原子变量,自增线程安全;
但是一下两句并未原子操作,在c++ 内存模型中,多线程存在编译乱序、cpu乱序、缓存可见性等等问题,接收端线程对于infos和seqs的可见顺序有可能是乱序的,即seqs的修改先可见,然后再是infos的赋值。

indicator_->infos[idx] = info; // 该两句应当是原子或者顺序不重排
indicator_->seqs[idx] = seq; // 重排导致接受端错误接收一场数据

在接收端的接收处理中,则会因为乱序,可能读取到next_seq以及seqs[idx]变化,但infos[idx]仍然为旧数据的情况。

uint64_t seq = indicator_->next_seq.load();   
if (seq != next_seq_) {
  auto idx = next_seq_ % kBufLength;
  auto actual_seq = indicator_->seqs[idx];  //去读到正确数据 
  if (actual_seq >= next_seq_) {   // 判断通过
    next_seq_ = actual_seq;
    *info = indicator_->infos[idx]; // 写进程数据暂时不可见,读取到了旧数据
    ++next_seq_;
    return true;
  } else {
    ADEBUG << "seq[" << next_seq_ << "] is writing, can not read now.";
  }
}

即,发送端发送,1,2,3,4,5, 五个数据包;
接收端接收到,1,2,1,4,5, 五个数据包;
特别是在车端的常见arm架构弱内存模型会极其常见,为非常严重的bug。

解决方法

将seq变量改为原子变量,并引入的memory order里的acquire release 内存序

struct Indicator {
    std::atomic<uint64_t> next_seq = {0};
    ReadableInfo infos[kBufLength];
    uint64_t seqs[kBufLength] = {0};
    std::atomic<uint64_t> seqs[kBufLength] = {0};
  };
bool ConditionNotifier::Notify(const ReadableInfo& info) {
  if (is_shutdown_.load()) {
    ADEBUG << "notifier is shutdown.";
    return false;
  }

  uint64_t seq = indicator_->next_seq.fetch_add(1);
  uint64_t idx = seq % kBufLength;
  indicator_->infos[idx] = info;
  indicator_->seqs[idx].store(seq, std::memory_order_release);  // infos赋值 先序于seqs赋值
  return true;
}

bool ConditionNotifier::Listen(int timeout_ms, ReadableInfo* info) {
  if (info == nullptr) {
    AERROR << "info nullptr.";
    return false;
  }

  if (is_shutdown_.load()) {
    ADEBUG << "notifier is shutdown.";
    return false;
  }

  int timeout_us = timeout_ms * 1000;
  while (!is_shutdown_.load()) {
    uint64_t seq = indicator_->next_seq.load();
    if (seq != next_seq_) {
      auto idx = next_seq_ % kBufLength;
      auto actual_seq = indicator_->seqs[idx].load(std::memory_order_acquire);
      if (actual_seq >= next_seq_) {
        next_seq_ = actual_seq;
        *info = indicator_->infos[idx];
        ++next_seq_;
        return true;
      } else {
        ADEBUG << "seq[" << next_seq_ << "] is writing, can not read now.";
      }
    }

    if (timeout_us > 0) {
      std::this_thread::sleep_for(std::chrono::microseconds(50));
      timeout_us -= 50;
    } else {
      return false;
    }
  }
  return false;
}