目录
简介
YOLOv3(You Only Look Once version 3)是一种高效的实时目标检测算法,由Joseph Redmon和Ali Farhadi于2018年提出。与传统的目标检测方法相比,YOLO将目标检测视为单一的回归问题,直接从完整图像预测边界框及其类别概率,使其成为速度和准确性之间平衡的优秀选择。
本教程适合以下人群:
- 对计算机视觉和深度学习感兴趣的初学者
- 想要快速实现目标检测功能的开发者
- 需要在项目中集成目标检测功能的学生或研究人员
环境准备
系统要求
- Python 3.6+
- CPU或GPU(推荐NVIDIA GPU以提高性能)
- 至少4GB内存(推荐8GB以上)
安装必要的库
首先,我们需要创建一个虚拟环境(可选但推荐):
bash
# 创建虚拟环境
python -m venv yolov3_env
# 激活虚拟环境
# Windows:
yolov3_env\Scripts\activate
# Linux/Mac:
source yolov3_env/bin/activate
安装必要的库:
bash
# 安装基本库(注意:opencv-python-headless 不包含 GUI 支持,无法使用下文的 cv2.imshow;如需弹出窗口显示结果,请改为安装 opencv-python)
pip install numpy opencv-python-headless matplotlib pillow
# 使用CPU版PyTorch
pip install torch torchvision
# 或使用GPU版PyTorch (根据你的CUDA版本选择适当的命令)
# 请访问 https://pytorch.org/get-started/locally/ 获取适用于你系统的安装命令
如果你想使用官方的darknet实现,你需要克隆并编译darknet库:
bash
git clone https://github.com/AlexeyAB/darknet.git
cd darknet
# 编辑Makefile,设置GPU=1、CUDNN=1(如果有GPU)
# 在Windows上,使用Makefile.win文件
# Linux/Mac编译:
make
或者,我们可以使用更简单的Python实现,如Ultralytics的YOLOv3:
bash
pip install ultralytics
获取预训练模型
YOLOv3提供了多种预训练模型,最常用的是在COCO数据集上训练的模型。
使用Ultralytics YOLOv3
bash
# 下载预训练权重
wget https://github.com/ultralytics/yolov3/releases/download/v9.0/yolov3.pt -O yolov3.pt
如果使用原始的darknet实现:
bash
# 下载配置文件和权重
wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3.cfg
wget https://pjreddie.com/media/files/yolov3.weights
COCO数据集类别
YOLOv3预训练模型可以检测80种不同的物体,包括人、车辆、动物和日常物品。完整的类别列表如下:
python
# The 80 COCO object categories the pretrained YOLOv3 model can detect.
# The "# a-b" markers give the list positions of the entries on the next line.
COCO_CLASSES = [
    # 0-7
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
    # 8-15
    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    # 16-23
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
    # 24-31
    "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
    # 32-39
    "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
    "tennis racket", "bottle",
    # 40-47
    "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    # 48-55
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    # 56-63
    "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
    # 64-71
    "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    # 72-79
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
    "toothbrush",
]
图像目标检测
现在我们开始实际操作,使用YOLOv3模型检测图像中的目标。
方法1:使用Ultralytics YOLOv3(推荐新手使用)
创建一个Python脚本 detect_image.py:
python
from ultralytics import YOLO
import cv2
import numpy as np
import time
def detect_objects(image_path, conf_threshold=0.25):
    """Detect objects in one image with Ultralytics YOLOv3 and save the result.

    The annotated image is written to the current directory as
    ``output_<original file name>`` and, when a GUI is available, shown in a
    window until a key is pressed.

    Args:
        image_path: Path of the image to analyse.
        conf_threshold: Minimum confidence for a detection to be kept.

    Returns:
        None. A message is printed and the function returns early when the
        image cannot be read.
    """
    import os  # local import: this script's import header does not provide it

    # Load the model
    model = YOLO('yolov3.pt')

    # Read the image
    img = cv2.imread(image_path)
    if img is None:
        print(f"无法读取图像: {image_path}")
        return

    # Time the inference call only; perf_counter is monotonic, unlike time.time
    start_time = time.perf_counter()
    # Forward the threshold so values below the model's own default take effect
    results = model(img, conf=conf_threshold)
    inference_time = time.perf_counter() - start_time
    print(f"推理时间: {inference_time:.2f}秒")

    # Draw every detection of the (single) result on the image
    result = results[0]
    for box in result.boxes:
        conf = float(box.conf[0])
        if conf < conf_threshold:
            continue
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        cls_id = int(box.cls[0])
        label = f"{result.names[cls_id]} {conf:.2f}"
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(img, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # basename() copes with both "/" and "\\" separators on Windows
    output_path = "output_" + os.path.basename(image_path)
    if cv2.imwrite(output_path, img):
        print(f"已保存结果到: {output_path}")
    else:
        # imwrite reports failure through its return value, not an exception
        print(f"保存结果失败: {output_path}")

    # Showing the window is optional: headless OpenCV builds raise cv2.error here
    try:
        cv2.imshow("Detection Result", img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
    except cv2.error:
        print("当前环境不支持图形界面,跳过结果显示")
if __name__ == "__main__":
    # Point this at your own image before running the script.
    detect_objects("test_image.jpg")
方法2:使用OpenCV的DNN模块
如果你想使用OpenCV的DNN模块直接加载darknet模型,可以使用以下代码:
python
import cv2
import numpy as np
import time
import argparse
def detect_objects_opencv(image_path, config_path, weights_path, conf_threshold=0.5,
                          nms_threshold=0.4, names_path='coco.names'):
    """Detect objects in one image with a darknet YOLOv3 model via OpenCV DNN.

    The annotated image is written to the current directory as
    ``output_<original file name>`` and, when a GUI is available, shown in a
    window until a key is pressed.

    Args:
        image_path: Path of the image to analyse.
        config_path: Path of the darknet ``.cfg`` file.
        weights_path: Path of the darknet ``.weights`` file.
        conf_threshold: Minimum class score for a candidate box to be kept.
        nms_threshold: Overlap threshold passed to non-maximum suppression.
        names_path: Text file with one class name per line.

    Returns:
        None. A message is printed and the function returns early when the
        image cannot be read.
    """
    import os  # local import: this script's import header does not provide it

    # Load the class names, one per line
    with open(names_path, 'rt', encoding='utf-8') as f:
        classes = f.read().strip().splitlines()

    # One reproducible colour per class; a private RandomState yields the same
    # values as seeding the global generator without touching global state
    colors = np.random.RandomState(42).randint(0, 255, size=(len(classes), 3), dtype=np.uint8)

    # Load the network
    net = cv2.dnn.readNetFromDarknet(config_path, weights_path)

    # Default to CPU and switch to CUDA when a CUDA device is reported
    backend = cv2.dnn.DNN_BACKEND_OPENCV
    target = cv2.dnn.DNN_TARGET_CPU
    if cv2.cuda.getCudaEnabledDeviceCount() > 0:
        backend = cv2.dnn.DNN_BACKEND_CUDA
        target = cv2.dnn.DNN_TARGET_CUDA
    net.setPreferableBackend(backend)
    net.setPreferableTarget(target)

    # Ask OpenCV for the output layer names directly; indexing getLayerNames()
    # by hand depends on the array shape returned by the installed version
    output_layers = net.getUnconnectedOutLayersNames()

    # Read the image
    img = cv2.imread(image_path)
    if img is None:
        print(f"无法读取图像: {image_path}")
        return
    height, width = img.shape[:2]

    # Pre-process: scale to [0, 1], resize to 416x416, swap BGR -> RGB
    blob = cv2.dnn.blobFromImage(img, 1/255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)

    # Time the forward pass only
    start_time = time.perf_counter()
    outputs = net.forward(output_layers)
    inference_time = time.perf_counter() - start_time
    print(f"推理时间: {inference_time:.2f}秒")

    # Collect candidate boxes. Each row is read as
    # [cx, cy, w, h, <unused>, class scores...] with coordinates relative to the image.
    class_ids = []
    confidences = []
    boxes = []
    for output in outputs:
        scores = output[:, 5:]
        best_ids = np.argmax(scores, axis=1)
        best_scores = scores[np.arange(scores.shape[0]), best_ids]
        # Filter in NumPy so the Python loop only visits rows that pass the threshold
        keep = best_scores > conf_threshold
        for detection, class_id, confidence in zip(output[keep], best_ids[keep], best_scores[keep]):
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)
            # Convert centre coordinates to the top-left corner
            x = int(center_x - w / 2)
            y = int(center_y - h / 2)
            boxes.append([x, y, w, h])
            confidences.append(float(confidence))
            class_ids.append(int(class_id))

    # Non-maximum suppression removes overlapping boxes
    indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)

    # NMSBoxes returns an empty tuple, an (N, 1) array or an (N,) array depending
    # on the OpenCV version; flattening handles all three. NumPy integers are not
    # instances of int, so an isinstance check cannot tell the shapes apart.
    for i in np.array(indices, dtype=int).reshape(-1):
        i = int(i)
        x, y, w, h = boxes[i]
        label = f"{classes[class_ids[i]]}: {confidences[i]:.2f}"
        color = [int(c) for c in colors[class_ids[i]]]
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        cv2.putText(img, label, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    # basename() copes with both "/" and "\\" separators on Windows
    output_path = "output_" + os.path.basename(image_path)
    if cv2.imwrite(output_path, img):
        print(f"已保存结果到: {output_path}")
    else:
        # imwrite reports failure through its return value, not an exception
        print(f"保存结果失败: {output_path}")

    # Showing the window is optional: headless OpenCV builds raise cv2.error here
    try:
        cv2.imshow("Detection Result", img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
    except cv2.error:
        print("当前环境不支持图形界面,跳过结果显示")
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description='YOLOv3目标检测')
    # (flag, type, default, help) for every command-line option
    for flag, kind, fallback, usage in (
        ('--image', str, 'test_image.jpg', '输入图像路径'),
        ('--config', str, 'yolov3.cfg', '模型配置文件路径'),
        ('--weights', str, 'yolov3.weights', '模型权重文件路径'),
        ('--conf', float, 0.5, '置信度阈值'),
    ):
        cli.add_argument(flag, type=kind, default=fallback, help=usage)
    opts = cli.parse_args()

    # A coco.names file must exist before running; it can be downloaded from
    # https://github.com/AlexeyAB/darknet/blob/master/data/coco.names
    detect_objects_opencv(opts.image, opts.config, opts.weights, opts.conf)
确保你在同一目录下有一个 coco.names 文件,其中包含所有COCO类别:
bash
# 下载coco.names文件
wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/data/coco.names
视频目标检测
现在让我们扩展到视频目标检测,这对于监控、行为分析等应用非常有用。
使用Ultralytics YOLOv3处理视频
创建一个脚本 detect_video.py:
python
from ultralytics import YOLO
import cv2
import time
import argparse
def detect_video(video_path, conf_threshold=0.25, output_path=None):
    """Run Ultralytics YOLOv3 on a video file or camera stream.

    Every frame is annotated with the detections and a running FPS counter,
    optionally written to ``output_path`` and, when a GUI is available, shown
    in a window. Pressing ``q`` in that window stops processing.

    Args:
        video_path: Path of a video file, or a camera index given as a digit
            string (an int is accepted as well).
        conf_threshold: Minimum confidence for a detection to be kept.
        output_path: Where to write the annotated video; ``None`` disables
            writing.

    Returns:
        None. A message is printed and the function returns early when the
        video source cannot be opened.
    """
    # Load the model
    model = YOLO('yolov3.pt')

    # A purely numeric source is treated as a camera index
    source = str(video_path)
    cap = cv2.VideoCapture(int(source) if source.isdigit() else source)
    if not cap.isOpened():
        print(f"无法打开视频源: {video_path}")
        return

    # Read the stream properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        # Cameras often report 0 (or NaN); a writer needs a positive frame rate
        fps = 30.0

    # Prepare the optional output video
    out = None
    if output_path:
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            print(f"无法创建输出视频: {output_path}")
            out.release()
            out = None

    frame_count = 0
    elapsed_time = 0.0  # defined up front so the summary works with zero frames
    show_window = True
    start_time = time.perf_counter()
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1

            # Forward the threshold so values below the model's own default take effect
            result = model(frame, conf=conf_threshold)[0]

            # Draw the detections on the frame
            for box in result.boxes:
                conf = float(box.conf[0])
                if conf < conf_threshold:
                    continue
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cls_id = int(box.cls[0])
                label = f"{result.names[cls_id]} {conf:.2f}"
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Running average FPS since the first frame
            elapsed_time = time.perf_counter() - start_time
            fps_text = f"FPS: {frame_count / max(elapsed_time, 1e-9):.2f}"
            cv2.putText(frame, fps_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            if out is not None:
                out.write(frame)

            if show_window:
                try:
                    cv2.imshow("Video Detection", frame)
                    # Press 'q' to quit
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
                except cv2.error:
                    # Headless OpenCV builds cannot open windows; keep processing
                    show_window = False
                    print("当前环境不支持图形界面,跳过画面显示")
    finally:
        # Release the capture and writer even if processing is interrupted
        cap.release()
        if out is not None:
            out.release()
        try:
            cv2.destroyAllWindows()
        except cv2.error:
            pass  # nothing to close on a build without GUI support

    avg_fps = frame_count / elapsed_time if elapsed_time > 0 else 0.0
    print(f"处理了 {frame_count} 帧,平均 FPS: {avg_fps:.2f}")
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description='YOLOv3视频目标检测')
    # (flag, type, default, help) for every command-line option
    for flag, kind, fallback, usage in (
        ('--video', str, '0', '输入视频路径或摄像头索引 (默认为0,表示默认摄像头)'),
        ('--conf', float, 0.25, '置信度阈值'),
        ('--output', str, None, '输出视频路径 (可选)'),
    ):
        cli.add_argument(flag, type=kind, default=fallback, help=usage)
    opts = cli.parse_args()
    detect_video(opts.video, opts.conf, opts.output)
使用OpenCV DNN处理视频
如果你想使用OpenCV DNN模块处理视频:
python
import cv2
import numpy as np
import time
import argparse
def detect_video_opencv(video_path, config_path, weights_path, conf_threshold=0.5,
                        nms_threshold=0.4, output_path=None, names_path='coco.names'):
    """Run a darknet YOLOv3 model on a video file or camera via OpenCV DNN.

    Every frame is annotated with the detections and a running FPS counter,
    optionally written to ``output_path`` and, when a GUI is available, shown
    in a window. Pressing ``q`` in that window stops processing. Processing
    statistics are printed at the end.

    Args:
        video_path: Path of a video file, or a camera index given as a digit
            string (an int is accepted as well).
        config_path: Path of the darknet ``.cfg`` file.
        weights_path: Path of the darknet ``.weights`` file.
        conf_threshold: Minimum class score for a candidate box to be kept.
        nms_threshold: Overlap threshold passed to non-maximum suppression.
        output_path: Where to write the annotated video; ``None`` disables
            writing.
        names_path: Text file with one class name per line.

    Returns:
        None. A message is printed and the function returns early when the
        video source cannot be opened.
    """
    # Load the class names, one per line
    with open(names_path, 'rt', encoding='utf-8') as f:
        classes = f.read().strip().splitlines()

    # One reproducible colour per class; a private RandomState yields the same
    # values as seeding the global generator without touching global state
    colors = np.random.RandomState(42).randint(0, 255, size=(len(classes), 3), dtype=np.uint8)

    # Load the network
    net = cv2.dnn.readNetFromDarknet(config_path, weights_path)

    # Default to CPU and switch to CUDA when a CUDA device is reported
    backend = cv2.dnn.DNN_BACKEND_OPENCV
    target = cv2.dnn.DNN_TARGET_CPU
    if cv2.cuda.getCudaEnabledDeviceCount() > 0:
        backend = cv2.dnn.DNN_BACKEND_CUDA
        target = cv2.dnn.DNN_TARGET_CUDA
    net.setPreferableBackend(backend)
    net.setPreferableTarget(target)

    # Ask OpenCV for the output layer names directly; indexing getLayerNames()
    # by hand depends on the array shape returned by the installed version
    output_layers = net.getUnconnectedOutLayersNames()

    # A purely numeric source is treated as a camera index
    source = str(video_path)
    cap = cv2.VideoCapture(int(source) if source.isdigit() else source)
    if not cap.isOpened():
        print(f"无法打开视频源: {video_path}")
        return

    # Read the stream properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        # Cameras often report 0 (or NaN); a writer needs a positive frame rate
        fps = 30.0

    # Prepare the optional output video
    out = None
    if output_path:
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            print(f"无法创建输出视频: {output_path}")
            out.release()
            out = None

    frame_count = 0
    processing_times = []
    show_window = True
    start_time = time.perf_counter()
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            frame_start_time = time.perf_counter()

            # Pre-process: scale to [0, 1], resize to 416x416, swap BGR -> RGB
            blob = cv2.dnn.blobFromImage(frame, 1/255.0, (416, 416), swapRB=True, crop=False)
            net.setInput(blob)

            # Forward pass
            outputs = net.forward(output_layers)

            # Collect candidate boxes. Each row is read as
            # [cx, cy, w, h, <unused>, class scores...] relative to the frame size.
            class_ids = []
            confidences = []
            boxes = []
            for output in outputs:
                scores = output[:, 5:]
                best_ids = np.argmax(scores, axis=1)
                best_scores = scores[np.arange(scores.shape[0]), best_ids]
                # Filter in NumPy so the Python loop only visits rows that pass
                keep = best_scores > conf_threshold
                for detection, class_id, confidence in zip(
                        output[keep], best_ids[keep], best_scores[keep]):
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)
                    # Convert centre coordinates to the top-left corner
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)
                    boxes.append([x, y, w, h])
                    confidences.append(float(confidence))
                    class_ids.append(int(class_id))

            # Non-maximum suppression removes overlapping boxes
            indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)

            # NMSBoxes returns an empty tuple, an (N, 1) array or an (N,) array
            # depending on the OpenCV version; flattening handles all three.
            # NumPy integers are not instances of int, so an isinstance check
            # cannot tell the shapes apart.
            for i in np.array(indices, dtype=int).reshape(-1):
                i = int(i)
                x, y, w, h = boxes[i]
                label = f"{classes[class_ids[i]]}: {confidences[i]:.2f}"
                color = [int(c) for c in colors[class_ids[i]]]
                cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
                cv2.putText(frame, label, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            # Per-frame processing time (pre-processing through drawing)
            processing_times.append(time.perf_counter() - frame_start_time)

            # Running average FPS since the first frame
            elapsed_time = time.perf_counter() - start_time
            fps_text = f"FPS: {frame_count / max(elapsed_time, 1e-9):.2f}"
            cv2.putText(frame, fps_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            if out is not None:
                out.write(frame)

            if show_window:
                try:
                    cv2.imshow("Video Detection", frame)
                    # Press 'q' to quit
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
                except cv2.error:
                    # Headless OpenCV builds cannot open windows; keep processing
                    show_window = False
                    print("当前环境不支持图形界面,跳过画面显示")
    finally:
        # Release the capture and writer even if processing is interrupted
        cap.release()
        if out is not None:
            out.release()
        try:
            cv2.destroyAllWindows()
        except cv2.error:
            pass  # nothing to close on a build without GUI support

    # Print the statistics
    avg_time = sum(processing_times) / len(processing_times) if processing_times else 0
    print(f"处理了 {frame_count} 帧")
    print(f"平均处理时间: {avg_time:.4f} 秒/帧")
    print(f"平均 FPS: {1 / avg_time:.2f}" if avg_time > 0 else "无法计算 FPS")
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description='YOLOv3视频目标检测')
    # (flag, type, default, help) for every command-line option
    for flag, kind, fallback, usage in (
        ('--video', str, '0', '输入视频路径或摄像头索引 (默认为0,表示默认摄像头)'),
        ('--config', str, 'yolov3.cfg', '模型配置文件路径'),
        ('--weights', str, 'yolov3.weights', '模型权重文件路径'),
        ('--conf', float, 0.5, '置信度阈值'),
        ('--output', str, None, '输出视频路径 (可选)'),
    ):
        cli.add_argument(flag, type=kind, default=fallback, help=usage)
    opts = cli.parse_args()

    # The NMS threshold is not exposed on the command line and stays at 0.4.
    detect_video_opencv(
        opts.video,
        opts.config,
        opts.weights,
        conf_threshold=opts.conf,
        nms_threshold=0.4,
        output_path=opts.output,
    )
模型性能优化
YOLOv3是一个功能强大的模型,但在资源有限的环境中,可能需要进行一些优化。以下是一些实用的优化技巧:
1. 降低输入图像分辨率
默认情况下,YOLOv3使用416×416的输入分辨率,但你可以降低它以提高速度:
python
# For example, lower the network input resolution to 320x320
# (the detection scripts in this tutorial pass (416, 416) here).
blob = cv2.dnn.blobFromImage(img, 1/255.0, (320, 320), swapRB=True, crop=False)
2. 使用YOLOv3-tiny
YOLOv3-tiny是YOLOv3的轻量级版本,速度更快但准确性略低:
bash
# 下载YOLOv3-tiny的配置和权重
wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3-tiny.cfg
wget https://pjreddie.com/media/files/yolov3-tiny.weights
然后修改代码,使用这些文件。
3. 利用GPU加速(如果可用)
对于OpenCV DNN实现:
python
# Ask OpenCV DNN to run inference through CUDA; as the FAQ of this tutorial
# notes, this needs an OpenCV build compiled with CUDA support.
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
对于Ultralytics实现,它会自动利用可用的GPU。
4. 批处理
如果你需要处理多个图像,可以考虑批处理来提高吞吐量:
python
# Batched inference with Ultralytics YOLOv3. The prediction argument that sets
# the batch size is named `batch`; `batch_size` is not a documented argument.
results = model(batch_of_images, batch=4)
5. 量化和优化
对于更高级的优化,可以考虑模型量化和TensorRT优化(需要NVIDIA GPU):
bash
# 使用ONNX和TensorRT(需要额外步骤)
pip install onnx onnxruntime-gpu
# 转换模型到ONNX格式,然后使用TensorRT优化
常见问题解答
1. 检测结果不准确怎么办?
- 尝试调整置信度阈值(conf_threshold)和NMS阈值(nms_threshold)
- 检查图像预处理步骤是否正确
- 确保模型权重和配置文件匹配
2. 为什么我的检测速度很慢?
- 如果没有GPU,速度会受限
- 尝试使用YOLOv3-tiny版本
- 减小输入分辨率
- 确保你的OpenCV是带CUDA支持编译的(用于GPU加速)
3. 如何处理大型图像?
- 将大图像分割成较小的块进行处理
- 降低图像分辨率,但这可能导致小目标检测性能下降
4. 为什么某些物体没有被检测到?
- 小目标往往更难检测,尝试增加输入分辨率
- 某些物体可能不在COCO数据集的80个类别中
- 物体可能被遮挡或光照条件不佳
5. 模型文件太大,有更小的替代方案吗?
- 使用YOLOv3-tiny(约34MB)代替完整的YOLOv3(约240MB)
- 对模型进行量化可以进一步减小体积