开发板PYNQ-Z1
代码如下,严重掉帧,会延迟大约三四秒,请问应该怎么优化?
用多进程或VDMA的优化方法要怎么实现?
import cv2
from pynq.overlays.base import BaseOverlay
from pynq.lib.video import *

print("base.bit")
base = BaseOverlay("base.bit")

# Monitor (output) configuration: 640x480 @ 60Hz, 24-bit BGR pixels.
Mode = VideoMode(640, 480, 24)
hdmi_out = base.video.hdmi_out
hdmi_out.configure(Mode, PIXEL_BGR)
hdmi_out.start()

# Camera (input) configuration: match the output resolution so the
# frame can be copied straight into the HDMI buffer.
frame_in_w = 640
frame_in_h = 480
videoIn = cv2.VideoCapture(0)
videoIn.set(cv2.CAP_PROP_FRAME_WIDTH, frame_in_w)
videoIn.set(cv2.CAP_PROP_FRAME_HEIGHT, frame_in_h)
print("capture device is open: " + str(videoIn.isOpened()))

face_cascade = cv2.CascadeClassifier(
    '/home/xilinx/jupyter_notebooks/base/video/data/'
    'haarcascade_frontalface_default.xml')
eye_cascade = cv2.CascadeClassifier(
    '/home/xilinx/jupyter_notebooks/base/video/data/'
    'haarcascade_eye.xml')

try:
    while True:
        ret, frame_vga = videoIn.read()
        if not ret:
            # Camera stopped delivering frames; leave the loop cleanly.
            # (The original `if (ret)/else raise` after this check was
            # unreachable, because `ret` is always truthy past this point.)
            break
        np_frame = frame_vga
        gray = cv2.cvtColor(frame_vga, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)
        for (x, y, w, h) in faces:
            cv2.rectangle(np_frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
            # Search for eyes only inside the detected face region.
            roi_gray = gray[y:y + h, x:x + w]
            roi_color = np_frame[y:y + h, x:x + w]
            eyes = eye_cascade.detectMultiScale(roi_gray)
            for (ex, ey, ew, eh) in eyes:
                cv2.rectangle(roi_color, (ex, ey), (ex + ew, ey + eh),
                              (0, 255, 0), 2)
        # Copy the annotated frame into a DMA-able buffer and display it.
        outframe = hdmi_out.newframe()
        outframe[:] = np_frame
        hdmi_out.writeframe(outframe)
finally:
    # Release camera and HDMI resources even on error/interrupt;
    # the original script leaked both.
    videoIn.release()
    hdmi_out.stop()
    del hdmi_out
你可以尝试以下几种优化措施:
import cv2
import time
from pynq.overlays.base import BaseOverlay
from pynq.lib.video import *

print("base.bit")
base = BaseOverlay("base.bit")

# Monitor (output) configuration: 640x480 @ 60Hz, 24-bit BGR pixels.
Mode = VideoMode(640, 480, 24)
hdmi_out = base.video.hdmi_out
hdmi_out.configure(Mode, PIXEL_BGR)
hdmi_out.start()

# Camera (input) configuration.
frame_in_w = 640
frame_in_h = 480
videoIn = cv2.VideoCapture(0)
videoIn.set(cv2.CAP_PROP_FRAME_WIDTH, frame_in_w)
videoIn.set(cv2.CAP_PROP_FRAME_HEIGHT, frame_in_h)
print("capture device is open: " + str(videoIn.isOpened()))

face_cascade = cv2.CascadeClassifier(
    '/home/xilinx/jupyter_notebooks/base/video/data/'
    'haarcascade_frontalface_default.xml')
eye_cascade = cv2.CascadeClassifier(
    '/home/xilinx/jupyter_notebooks/base/video/data/'
    'haarcascade_eye.xml')

frame_rate = 10          # process at most 10 frames per second
delay = 1 / frame_rate   # minimum time budget per processed frame

while True:
    start = time.time()
    ret, frame_vga = videoIn.read()
    if not ret:
        break
    np_frame = frame_vga
    gray = cv2.cvtColor(frame_vga, cv2.COLOR_BGR2GRAY)
    # Only run detection on the central quarter of the frame to cut cost.
    # (rx, ry, rw, rh) is that centre ROI in full-frame coordinates;
    # distinct names avoid the shadowing of x/y/w/h by the loop below,
    # which made the original hard to follow.
    rx, ry = frame_in_w // 4, frame_in_h // 4
    rw, rh = frame_in_w // 2, frame_in_h // 2
    gray = gray[ry:ry + rh, rx:rx + rw]
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    for (x, y, w, h) in faces:
        # Detection coordinates are relative to the ROI; shift them back
        # into full-frame coordinates before drawing.
        cv2.rectangle(np_frame, (x + rx, y + ry),
                      (x + rx + w, y + ry + h), (255, 0, 0), 2)
        roi_gray = gray[y:y + h, x:x + w]
        roi_color = np_frame[y + ry:y + ry + h, x + rx:x + rx + w]
        # NOTE(review): the original answer was truncated at this call;
        # the remainder is completed following the first snippet's pattern.
        eyes = eye_cascade.detectMultiScale(roi_gray)
        for (ex, ey, ew, eh) in eyes:
            cv2.rectangle(roi_color, (ex, ey), (ex + ew, ey + eh),
                          (0, 255, 0), 2)
    outframe = hdmi_out.newframe()
    outframe[:] = np_frame
    hdmi_out.writeframe(outframe)
    # Throttle to the target frame rate (the original defined frame_rate
    # and delay but never used them).
    elapsed = time.time() - start
    if elapsed < delay:
        time.sleep(delay - elapsed)

videoIn.release()
hdmi_out.stop()
del hdmi_out
需要添加VDMA来缓存Sensor输出的视频数据
要使用多线程优化上述代码,首先要使用多线程库,比如Python的多线程库threading,将代码中的任务分解到多个线程中。比如,可以将图像采集任务分解到一个线程中,将人脸检测任务分解到另一个线程中,然后将结果输出到HDMI输出。另外,还可以使用多进程将图像处理任务分摊到多个CPU核心上,以提高处理速度。
为了优化代码,可以使用VDMA来实现视频输入和输出,以减少CPU负载。可以使用PYNQ库中的Video_in和Video_out类来实现,这样可以实现视频流的高效处理。此外,还可以使用OpenCV的硬件加速功能,比如使用硬件加速的haar级联分类器来提高检测速度。
由于您的代码没有使用多进程或者VDMA技术,掉帧和延迟的问题可能与以下因素有关:
如果您想使用多进程或VDMA技术来优化代码,可以考虑以下方法:
如果您决定使用多进程或VDMA技术来优化代码,可以参考以下教程来实现:
根据您提供的代码,您正在捕捉USB摄像头并将视频帧显示在PYNQ-Z1板子的HDMI输出上。掉帧和延迟可能与以下因素有关:
USB带宽:USB总线上的带宽限制可能导致掉帧。因为在您的代码中,图像是通过USB接口从摄像头获取的,然后传输到HDMI接口。您可以尝试减少图像大小或调整帧速率来缓解此问题。
图像处理复杂度:您的代码包含对图像的检测和分类。这些操作可能会导致计算资源不足,从而导致延迟和掉帧。您可以尝试将处理移动到FPGA上来解决此问题。
实现方式:多进程和VDMA都是优化延迟和掉帧的有效方式。使用多进程,您可以将图像捕获和处理拆分为不同的进程。使用VDMA,您可以在处理和输出之间直接传输数据,而无需将它们复制到内存中。
下面是使用多进程的示例代码,将图像处理拆分为两个进程。一个进程负责捕获图像并将其放入队列中,另一个进程从队列中获取图像并进行处理:
import cv2
from pynq.overlays.base import BaseOverlay
from pynq.lib.video import *
from multiprocessing import Process, Queue

print("base.bit")
base = BaseOverlay("base.bit")

# Monitor (output) configuration: 640x480 @ 60Hz, 24-bit BGR pixels.
Mode = VideoMode(640, 480, 24)
hdmi_out = base.video.hdmi_out
hdmi_out.configure(Mode, PIXEL_BGR)
hdmi_out.start()

# Camera (input) configuration.
frame_in_w = 640
frame_in_h = 480
videoIn = cv2.VideoCapture(0)
videoIn.set(cv2.CAP_PROP_FRAME_WIDTH, frame_in_w)
videoIn.set(cv2.CAP_PROP_FRAME_HEIGHT, frame_in_h)
print("capture device is open: " + str(videoIn.isOpened()))

face_cascade = cv2.CascadeClassifier(
    '/home/xilinx/jupyter_notebooks/base/video/data/'
    'haarcascade_frontalface_default.xml')
eye_cascade = cv2.CascadeClassifier(
    '/home/xilinx/jupyter_notebooks/base/video/data/'
    'haarcascade_eye.xml')


def read_frames(input_queue):
    """Capture camera frames and feed them into the pipeline.

    Puts ``None`` as an end-of-stream sentinel when the camera stops
    delivering frames.
    """
    # NOTE(review): videoIn is inherited from the parent via fork;
    # confirm the V4L2 capture handle survives fork on your platform,
    # or open the device inside this process instead.
    while True:
        ret, frame_vga = videoIn.read()
        if not ret:
            input_queue.put(None)
            break
        input_queue.put(frame_vga)


def process_frames(input_queue, output_queue):
    """Run face and eye detection on each frame and draw the boxes."""
    while True:
        frame_vga = input_queue.get()
        if frame_vga is None:
            # Propagate the end-of-stream sentinel downstream.
            output_queue.put(None)
            break
        np_frame = frame_vga
        gray = cv2.cvtColor(frame_vga, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)
        for (x, y, w, h) in faces:
            cv2.rectangle(np_frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
            roi_gray = gray[y:y + h, x:x + w]
            roi_color = np_frame[y:y + h, x:x + w]
            eyes = eye_cascade.detectMultiScale(roi_gray)
            for (ex, ey, ew, eh) in eyes:
                cv2.rectangle(roi_color, (ex, ey), (ex + ew, ey + eh),
                              (0, 255, 0), 2)
        output_queue.put(np_frame)


def write_frames(output_queue):
    """Copy processed frames into HDMI frame buffers and display them."""
    while True:
        np_frame = output_queue.get()
        if np_frame is None:
            break
        outframe = hdmi_out.newframe()
        outframe[:] = np_frame
        hdmi_out.writeframe(outframe)


if __name__ == '__main__':
    # Guarding the process launch is required for spawn-based platforms
    # and good practice everywhere (the original started processes at
    # import time).
    #
    # Bounded queues are the key latency fix: with unbounded Queue()s a
    # slow detection stage lets frames pile up, so the display falls
    # seconds behind the camera. maxsize=2 back-pressures the producer
    # instead of buffering stale frames.
    input_queue = Queue(maxsize=2)
    output_queue = Queue(maxsize=2)

    processes = [
        Process(target=read_frames, args=(input_queue,)),
        Process(target=process_frames, args=(input_queue, output_queue)),
        Process(target=write_frames, args=(output_queue,)),
    ]
    for process in processes:
        process.start()
    # Wait for the pipeline to drain, then release the resources.
    for process in processes:
        process.join()
    videoIn.release()
    hdmi_out.stop()
    del hdmi_out
上述代码中,将处理每个视频帧的过程拆分成了三个步骤,并用不同的进程来执行这些步骤。read_frames()函数从相机读取视频帧,并将它们放入输入队列中;process_frames()函数从输入队列中获取视频帧并执行面部识别和绘制;write_frames()函数从输出队列中取出处理后的帧并写入HDMI输出。
您的代码存在掉帧的问题,这可能是由于数据传输速度的限制或CPU计算能力不足导致的。您可以采取以下措施来优化代码:
1 使用多进程并行化:
使用Python的multiprocessing模块可以将不同的处理任务分配给不同的进程来并行处理,以提高整体处理速度。您可以将读取帧和处理帧的任务分别分配给两个进程,以减轻CPU的负担。这里需要注意的是,由于PYNQ开发板中的CPU核心数量有限,所以不要同时启动过多的进程,否则可能会导致进程之间的资源竞争和调度延迟。
2 使用VDMA模块:
PYNQ开发板提供了Xilinx的VDMA(Video Direct Memory Access)模块,它可以在FPGA中实现高带宽的数据传输,减轻CPU的负担。您可以将读取帧和处理帧的任务分别分配给CPU和FPGA,以实现高效的数据传输和处理。
下面是使用多进程的示例代码:
import cv2
import time  # needed for the idle sleep in process_frame (missing in the original)
from pynq.overlays.base import BaseOverlay
from pynq.lib.video import *
from multiprocessing import Process, Queue


def read_frame(queue):
    """Capture camera frames in a dedicated process and queue them."""
    frame_in_w = 640
    frame_in_h = 480
    videoIn = cv2.VideoCapture(0)
    videoIn.set(cv2.CAP_PROP_FRAME_WIDTH, frame_in_w)
    videoIn.set(cv2.CAP_PROP_FRAME_HEIGHT, frame_in_h)
    while True:
        ret, frame_vga = videoIn.read()
        if not ret:
            break
        queue.put(frame_vga)
    videoIn.release()


def process_frame(queue):
    """Detect faces/eyes on queued frames and display them via HDMI.

    ``base`` and ``Mode`` are globals set in the __main__ block before
    the worker processes are forked, so they are inherited here.
    """
    hdmi_out = base.video.hdmi_out
    hdmi_out.configure(Mode, PIXEL_BGR)
    hdmi_out.start()
    face_cascade = cv2.CascadeClassifier(
        '/home/xilinx/jupyter_notebooks/base/video/data/'
        'haarcascade_frontalface_default.xml')
    eye_cascade = cv2.CascadeClassifier(
        '/home/xilinx/jupyter_notebooks/base/video/data/'
        'haarcascade_eye.xml')
    while True:
        if not queue.empty():
            frame_vga = queue.get()
            np_frame = frame_vga
            gray = cv2.cvtColor(frame_vga, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, 1.3, 5)
            for (x, y, w, h) in faces:
                cv2.rectangle(np_frame, (x, y), (x + w, y + h),
                              (255, 0, 0), 2)
                roi_gray = gray[y:y + h, x:x + w]
                roi_color = np_frame[y:y + h, x:x + w]
                eyes = eye_cascade.detectMultiScale(roi_gray)
                for (ex, ey, ew, eh) in eyes:
                    cv2.rectangle(roi_color, (ex, ey), (ex + ew, ey + eh),
                                  (0, 255, 0), 2)
            outframe = hdmi_out.newframe()
            outframe[:] = np_frame
            hdmi_out.writeframe(outframe)
        else:
            time.sleep(0.01)  # idle briefly instead of busy-waiting


if __name__ == '__main__':
    print("base.bit")
    # NOTE(review): the original answer was truncated here ("base = Base");
    # the remainder is completed following the earlier snippets. `Mode`
    # was also referenced by process_frame but never defined.
    base = BaseOverlay("base.bit")
    Mode = VideoMode(640, 480, 24)
    # Bounded queue so frames cannot pile up and add seconds of latency.
    queue = Queue(maxsize=2)
    p_read = Process(target=read_frame, args=(queue,))
    p_proc = Process(target=process_frame, args=(queue,))
    p_read.start()
    p_proc.start()
    p_read.join()
    p_proc.join()
如果对您有帮助,请给予采纳,谢谢。
可以尝试使用多进程或VDMA的优化方法来提高程序的性能。
多进程可以将任务分解到多个进程中,从而提高程序的执行效率。
VDMA可以将视频数据从内存中读取,从而减少CPU的负担,提高程序的执行效率。