第1章 当前灰度膨胀腐蚀算子
图像最大值最小值滤波。效果如下:
1.1. 常规实现
1.1.1. 半径范围遍历
对于一个像素,其膨胀腐蚀结果,查看周围半径范围内的所有像素,取最大最小值。
uint8_t nMax = 0;
for (int j = -nRY; j <= nRY; j++)
{
for (int i = -nRX; i <= nRX; i++)
{
int _X = x + i;
int _Y = y + j;
nMax = max(nMax, pSrc[_Y * nW + _X]);
}
}
pDst[y*nW+x] = nMax;
1.1.2. 亚像素
当半径不为整数时,膨胀边缘数据会和周围像素进行插值。这是当前实现半像素套印的方式。
1.1.3. 当前算子存在问题
单像素处理的时间复杂度为O(N^2),当滤波半径较大时,耗时急剧上升。另外,在滤波半径接近整数时,仍然要计算亚像素,增加了耗时。
第2章 灰度膨胀腐蚀算子的优化
2.1. SIMD优化
2.1.1. 原始实现
逐像素,每个像素在半径范围内取最大最小值。
void morph(uint8_t* pSrc, uint8_t* pDst, int nW, int nH, int nRX, int nRY)
{
for (int y = nRY; y < nH - nRY; y++)
{
for (int x = nRX; x < nW - nRX; x++)
{
uint8_t nMax = 0;
for (int j = -nRY; j <= nRY; j++)
{
for (int i = -nRX; i <= nRX; i++)
{
int _X = x + i;
int _Y = y + j;
nMax = max(nMax, pSrc[_Y * nW + _X]);
}
}
pDst[y * nW + x] = nMax;
}
}
}
AMD2700xCPU,2k*1k的灰度图,半径为1,耗时为33mm。
2.1.2. 每次处理32个像素
每个像素在半径范围内取最大最小值。当前VisionLink的实现
void morph2(uint8_t* pSrc, uint8_t* pDst, int nW, int nH, int nRX, int nRY)
{
int x, y;
for (y = nRY; y < nH - nRY; y++)
{
//一次处理32个像素
int w = nW - 2 * nRX;
int n = w / 32;
int t = w % 32;
for (x = 0; x < n; x++)
{
__m256i m0 = _mm256_set1_epi8(0);
for (int j = -nRY; j <= nRY; j++)
{
for (int i = -nRX; i <= nRX; i++)
{
__m256i m1 = _mm256_loadu_si256((const __m256i*)(pSrc + (y + j)*nW + x * 32 + i));
m0 = _mm256_max_epu8(m1, m0);
}
}
_mm256_storeu_si256((__m256i*)(pDst + y*nW + x * 32), m0);
}
for (int k = 0; k < t; k++)
{
uint8_t nMax = 0;
for (int j = -nRY; j <= nRY; j++)
{
for (int i = -nRX; i <= nRX; i++)
{
int _X = x * 32 + i + k;
int _Y = y + j;
nMax = max(nMax, pSrc[_Y*nW + _X]);
}
}
pDst[y*nW + x * 32 + k] = nMax;
}
}
}
耗时统计:2k*1k的灰度图像:
半径 | 耗时 |
---|---|
1 | 0.5ms |
2 | 1.3ms |
3 | 2.4ms |
5 | 5ms |
7 | 10ms |
相比未优化的实现,有60倍的提升。但随着半径的增加,耗时急剧上升。
2.2. O(1)算法原理
针对随机半径,耗时上升的现象,我们期望找到0(1)的灰度膨胀腐蚀算法原理,即耗时不随着半径的增加而增加。
比如下图的原理:
代码实现如下:
void morph3(uint8_t* pSrc, uint8_t* pDst, int nW, int nH, int nRX, int nRY,int bDilation)
{
//const int bDilation = 1;
int x, y;
int i, j, k;
int nSizeX = nRX * 2 + 1;
int nSizeY = nRY * 2 + 1;
uint8_t* G = new uint8_t[nW * nSizeY];
uint8_t* H = new uint8_t[nW * nSizeY];
__m128i m0 = _mm_setzero_si128(), mraw, m1;
auto pSrc1 = pSrc;
int n1 = nW / nSizeX;
int t1 = nW% nSizeX;
int n2 = (nW - nRX * 2) / 16;
int t2 = (nW - nRX * 2) % 16;
__m128i mmask, mmasktail;
if (bDilation)
{
mmask = _mm_set_epi8(nSizeX >= 16 ? 0xff : 0x00, nSizeX >= 15 ? 0xff : 0x00,
nSizeX >= 14 ? 0xff : 0x00, nSizeX >= 13 ? 0xff : 0x00,
nSizeX >= 12 ? 0xff : 0x00, nSizeX >= 11 ? 0xff : 0x00,
nSizeX >= 10 ? 0xff : 0x00, nSizeX >= 9 ? 0xff : 0x00,
nSizeX >= 8 ? 0xff : 0x00, nSizeX >= 7 ? 0xff : 0x00,
nSizeX >= 6 ? 0xff : 0x00, nSizeX >= 5 ? 0xff : 0x00,
nSizeX >= 4 ? 0xff : 0x00, nSizeX >= 3 ? 0xff : 0x00,
nSizeX >= 2 ? 0xff : 0x00, nSizeX >= 1 ? 0xff : 0x00);
mmasktail = _mm_set_epi8(t1 >= 16 ? 0xff : 0x00, t1 >= 15 ? 0xff : 0x00,
t1 >= 14 ? 0xff : 0x00, t1 >= 13 ? 0xff : 0x00,
t1 >= 12 ? 0xff : 0x00, t1 >= 11 ? 0xff : 0x00,
t1 >= 10 ? 0xff : 0x00, t1 >= 9 ? 0xff : 0x00,
t1 >= 8 ? 0xff : 0x00, t1 >= 7 ? 0xff : 0x00,
t1 >= 6 ? 0xff : 0x00, t1 >= 5 ? 0xff : 0x00,
t1 >= 4 ? 0xff : 0x00, t1 >= 3 ? 0xff : 0x00,
t1 >= 2 ? 0xff : 0x00, t1 >= 1 ? 0xff : 0x00);
}
else
{
mmask = _mm_set_epi8(nSizeX < 16 ? 0xff : 0x00, nSizeX < 15 ? 0xff : 0x00,
nSizeX < 14 ? 0xff : 0x00, nSizeX < 13 ? 0xff : 0x00,
nSizeX < 12 ? 0xff : 0x00, nSizeX < 11 ? 0xff : 0x00,
nSizeX < 10 ? 0xff : 0x00, nSizeX < 9 ? 0xff : 0x00,
nSizeX < 8 ? 0xff : 0x00, nSizeX < 7 ? 0xff : 0x00,
nSizeX < 6 ? 0xff : 0x00, nSizeX < 5 ? 0xff : 0x00,
nSizeX < 4 ? 0xff : 0x00, nSizeX < 3 ? 0xff : 0x00,
nSizeX < 2 ? 0xff : 0x00, nSizeX < 1 ? 0xff : 0x00);
mmasktail = _mm_set_epi8(t1 < 16 ? 0xff : 0x00, t1 < 15 ? 0xff : 0x00,
t1 < 14 ? 0xff : 0x00, t1 < 13 ? 0xff : 0x00,
t1 < 12 ? 0xff : 0x00, t1 < 11 ? 0xff : 0x00,
t1 < 10 ? 0xff : 0x00, t1 < 9 ? 0xff : 0x00,
t1 < 8 ? 0xff : 0x00, t1 < 7 ? 0xff : 0x00,
t1 < 6 ? 0xff : 0x00, t1 < 5 ? 0xff : 0x00,
t1 < 4 ? 0xff : 0x00, t1 < 3 ? 0xff : 0x00,
t1 < 2 ? 0xff : 0x00, t1 < 1 ? 0xff : 0x00);
}
//处理得到前nSizeY行的结果
for (k = 0; k < nSizeY; k++) //前nSizeY行
{
auto pSrc1 = pSrc + k * nW;
for (i = 0; i < n1; i++) //每行有多少个分段 n1 = nW / nSizeX
{
mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));
m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);
for (j = 1; j < nSizeX; j++) //移位比较
{
mraw = _mm_slli_si128(mraw, 1);
if (!bDilation) mraw = _mm_or_si128(mraw, m255);
m1 = bDilation?_mm_max_epu8(m1, mraw): _mm_min_epu8(m1, mraw);
}
memcpy(G + k * nW + i * nSizeX, m1.m128i_u8, nSizeX);
mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));
m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);
for (j = 1; j < nSizeX; j++) //移位比较
{
mraw = _mm_srli_si128(mraw, 1);
m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);
}
memcpy(H + k * nW + i * nSizeX, m1.m128i_u8, nSizeX);
}
if (t1 > 0) //边部处理
{
mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));
m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);
for (j = 1; j < t1; j++) //移位比较
{
mraw = _mm_slli_si128(mraw, 1);
if (!bDilation) mraw = _mm_or_si128(mraw, m255);
m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);
}
memcpy(G + k * nW + i * nSizeX, m1.m128i_u8, t1);
mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));
m1 = mraw = bDilation ? _mm_and_si128(mraw, mmasktail) : _mm_or_si128(mraw, mmasktail);
for (j = 1; j < t1; j++) //移位比较
{
mraw = _mm_srli_si128(mraw, 1);
m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);
}
memcpy(H + k * nW + i * nSizeX, m1.m128i_u8, t1);
}
}
int pIndex = 0;
for (y = nRY; y < nH - nRY; y++)
{
//计算得到最大值滤波结果
//从G,H中得到滤波结果
for (i = 0; i < n2; i++) //n2 = (nW / nSizeX * nSizeX - nSizeX) / 16; //每行处理多少次
{
__m128i mret = bDilation ? _mm_setzero_si128():_mm_set1_epi8(-1);
for (j = 0; j < nSizeY; j++) //多行比较大小
{
__m128i m1 = _mm_loadu_si128((const __m128i*)(G + j * nW + i * 16 + 2 * nRX));
__m128i m2 = _mm_loadu_si128((const __m128i*)(H + j * nW + i * 16));
m1 = bDilation ? _mm_max_epu8(m1, m2): _mm_min_epu8(m1, m2);
mret = bDilation ? _mm_max_epu8(mret, m1): _mm_min_epu8(mret, m1);
}
_mm_storeu_si128((__m128i*)(pDst + y * nW + i * 16 + nRX), mret);
}
for (k = 0; k < t2; k++) //边部处理
{
uint8_t ret = bDilation ? 0 : 255;
for (j = 0; j < nSizeY; j++)
{
uint8_t a = (uint8_t)*(G + j * nW + i * 16 + 2 * nRX + j);
uint8_t b = (uint8_t) * (H + j * nW + i * 16 + j);
a = bDilation ? max(a, b):min(a,b);
ret = bDilation ? max(ret, a):min(ret,a);
}
*(pDst + y * nW + i * 16 + nRX + k) = ret;
}
//更新下一行结果
if (y == nH - nRY - 1)
break;
auto pSrc1 = pSrc + (y + nRY + 1) * nW;
for (i = 0; i < n1; i++) //每行有多少个分段 n1 = nW / nSizeX
{
mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));
m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);
for (int j = 1; j < nSizeX; j++) //移位比较
{
mraw = _mm_slli_si128(mraw, 1);
if (!bDilation) mraw = _mm_or_si128(mraw, m255);
m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);
}
//_mm_storeu_si128((__m128i*)(GH + pIndex*2 * nW + i * nSizeX), m1);
memcpy(G + pIndex * nW + i * nSizeX, m1.m128i_u8, nSizeX);
mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));
m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);
for (int j = 1; j < nSizeX; j++) //移位比较
{
mraw = _mm_srli_si128(mraw, 1);
m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);
}
//_mm_storeu_si128((__m128i*)(GH + (pIndex*2+1) * nW + i * nSize), m1);
memcpy(H + pIndex * nW + i * nSizeX, m1.m128i_u8, nSizeX);
}
if (t1 > 0) //边部处理
{
mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));
m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);
for (j = 1; j < t1; j++) //移位比较
{
mraw = _mm_slli_si128(mraw, 1);
if (!bDilation) mraw = _mm_or_si128(mraw, m255);
m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);
}
memcpy(G + pIndex * nW + i * nSizeX, m1.m128i_u8, t1);
mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));
m1 = mraw = bDilation ? _mm_and_si128(mraw, mmasktail) : _mm_or_si128(mraw, mmasktail);
for (j = 1; j < t1; j++) //移位比较
{
mraw = _mm_srli_si128(mraw, 1);
m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);
}
memcpy(H + pIndex * nW + i * nSizeX, m1.m128i_u8, t1);
}
pIndex = (pIndex + 1) % nSizeY;
}
//delete[] GH;
delete[] G;
delete[] H;
}
在如下链接https://www.cnblogs.com/Imageshop/p/7018510.html,有一个另外的实现,本实现占用空间少,且耗时更短。唯一的问题的支持的最大半径为7,可以通过升级为AVX2,最大支持15。
耗时统计如下:
半径 | 耗时 |
---|---|
1 | 2.2ms |
2 | 2.2ms |
3 | 2.3ms |
5 | 2.6ms |
7 | 2.7ms |
基本上做到了O(1),但是,基础的耗时太长。
2.3. halcon,opencv
那么,halcon,opencv可以做到多块呢?
以下是Halcon测试代码:
HObject ho_Image1, ho_Red, ho_Green, ho_Blue;
HObject ho_GrayImage, ho_ImageMin;
SetSystem("parallelize_operators", "false");
ReadImage(&ho_Image1, "D:/1.bmp");
//Decompose3(ho_Image1, &ho_Red, &ho_Green, &ho_Blue);
Rgb1ToGray(ho_Image1, &ho_GrayImage);
auto start = high_resolution_clock::now();
for (int i = 0; i < 1000; i++)
GrayDilationShape(ho_GrayImage, &ho_ImageMin, 9, 9, "rectangle");
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
cout << "t:" << duration.count() / 1000.0 << "ms" << endl;
WriteImage(ho_ImageMin, "bmp", 0, "D:/2H.bmp");
通过SetSystem(“parallelize_operators”, “false”);来控制是否开启并行,以防halcon不讲武德。
以及OpenCV的测试代码:
cv::Mat src, dst;
src = cv::imread("D:\\1.bmp", cv::IMREAD_GRAYSCALE); // Read the file
if (src.empty())
{
cout << "Could not open or find the image" << endl;
return -1;
}
cv::Mat element = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(15, 15));
auto start = high_resolution_clock::now();
for(int i = 0;i<10000;i++)
cv::morphologyEx(src, dst, cv::MORPH_ERODE, element);
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
cout << "Time taken by function: "
<< duration.count()/1000.0<< "ms" << endl;
imwrite("D:\\2.bmp", dst);
耗时分别如下:
半径 | halcon | Halcon 并行 | OpenCV |
---|---|---|---|
1 | 0.3 | 0.2 | 0.3 |
2 | 0.6 | 0.3 | 0.4 |
3 | 0.9 | 0.4 | 0.6 |
5 | 1.3 | 0.6 | 0.8 |
7 | 2.3 | 0.8 | 1.1 |
2.4. 行列分开方式
既然halcon也没有使用O(1)的算法原理,说明O(1)是有代价的,基础耗时就比较高。至少在半径小于10的情况下,Halcon的灰度膨胀腐蚀算子耗时都是递增的。
再就是行列分开的方法了,代码实现如下:
void morph4(uint8_t* pSrc, uint8_t* pDst, int nW, int nH, int nRX, int nRY, int bDilation)
{
int halfRow = nRY;
int halfCol = nRX;
uint8_t* pTmp = new uint8_t[nW * nH];
int nCol = nW % 32 == 0 ? nW / 32 : nW / 32 + 1;
int* nOffsetX = new int[nCol];
memset(nOffsetX, 0, sizeof(int) * nCol);
nOffsetX[nCol - 1] = nW % 32 == 0 ? 0 : 32 - nW % 32;
//处理列
for (int i = 0; i < nH; i++)
{
uint8_t* pSrc1 = pSrc + i * nW;
uint8_t* pTmp1 = pTmp + i * nW;
for (int j = 0; j < nW; j += 32