How to merge ROI OpenCV Code + Degirum SDK inference

I am wondering how to merge these two code bases so I can dynamically change the ROI while still using the DeGirum SDK for inference. I tried a few solutions but did not land on anything that works. Regards!

<opencv_code>

import cv2
import datetime

# Open camera device 1 and fail fast if it is unavailable.
cap = cv2.VideoCapture(1)
if not cap.isOpened():
    print("Camera cannot open")
    exit()

# Fullscreen preview window.
cv2.namedWindow("Camera", cv2.WND_PROP_FULLSCREEN)
cv2.setWindowProperty("Camera", cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

# Fixed display resolution the cropped ROI is scaled to.
SCREEN_W, SCREEN_H = 1280, 720

# Shared ROI state. zoom_x / zoom_y are the crop origin as fractions of the
# frame size; zoom_wh is the crop width/height as a fraction of the frame
# (1.0 = full frame). The *_min / *_max entries bound the pan/zoom range.
globalz = {
    'zoom_step': 0.01,
    'zoom_xy_min': 0.0,
    'zoom_x': 0.0,
    'zoom_y': 0.0,
    'zoom_xy_max': 0.4,
    'zoom_wh_min': 1.0,
    'zoom_wh': 1.0,
    'zoom_wh_max': 0.2,
}

# On-screen help overlay and transient annotation state.
overlay_visible = True
annotation_text = ""
annotation_timer = 0

def set_annotation(text, duration=60):
    """Show *text* as a transient on-screen annotation.

    The annotation is rendered by draw_overlay() for *duration* frames
    (the timer is decremented once per drawn frame).
    """
    global annotation_text, annotation_timer
    annotation_text = text
    annotation_timer = duration

def apply_roi(frame):
    """Crop *frame* to the current ROI and scale it to the display size.

    The crop rectangle is clamped to stay inside the frame. Note the final
    resize always outputs SCREEN_W x SCREEN_H, so the aspect ratio of the
    crop is not preserved.
    """
    h, w = frame.shape[:2]
    # Convert fractional ROI state into pixel coordinates.
    x = int(globalz['zoom_x'] * w)
    y = int(globalz['zoom_y'] * h)
    cw = int(globalz['zoom_wh'] * w)
    ch = int(globalz['zoom_wh'] * h)
    # Clamp so the crop is non-empty and fully inside the frame.
    x = max(0, min(x, w - 1))
    y = max(0, min(y, h - 1))
    cw = max(1, min(cw, w - x))
    ch = max(1, min(ch, h - y))
    cropped = frame[y:y+ch, x:x+cw]
    return cv2.resize(cropped, (SCREEN_W, SCREEN_H))

def zoom_in():
    """Step the ROI one increment tighter (smaller crop, shifted origin)."""
    # Limit check compares the origin against zoom_xy_max; when both axes
    # reach it, zoom_wh has shrunk to zoom_wh_max (1.0 - 0.4 * 2 = 0.2).
    if (globalz['zoom_x'] + globalz['zoom_step'] > globalz['zoom_xy_max'] and
            globalz['zoom_y'] + globalz['zoom_step'] > globalz['zoom_xy_max']):
        set_annotation("At max zoom")
    else:
        globalz['zoom_x'] += globalz['zoom_step']
        globalz['zoom_y'] += globalz['zoom_step']
        # Width/height shrink twice as fast as the origin moves so the
        # crop stays centered.
        globalz['zoom_wh'] -= globalz['zoom_step'] * 2
        set_annotation("Zoom In")

def zoom_out():
    """Step the ROI one increment wider (larger crop, origin moved back)."""
    # At the minimum, origin returns to (0, 0) and zoom_wh to 1.0 (full frame).
    if (globalz['zoom_x'] - globalz['zoom_step'] < globalz['zoom_xy_min'] and
            globalz['zoom_y'] - globalz['zoom_step'] < globalz['zoom_xy_min']):
        set_annotation("At min zoom")
    else:
        globalz['zoom_x'] -= globalz['zoom_step']
        globalz['zoom_y'] -= globalz['zoom_step']
        # Width/height grow twice as fast as the origin moves back.
        globalz['zoom_wh'] += globalz['zoom_step'] * 2
        set_annotation("Zoom Out")

def set_min_zoom():
    """Reset the ROI to the full frame (no zoom, no pan)."""
    globalz['zoom_x'] = globalz['zoom_xy_min']
    globalz['zoom_y'] = globalz['zoom_xy_min']
    globalz['zoom_wh'] = globalz['zoom_wh_min']
    set_annotation("Zoom reset")

# NOTE(review): left/right adjust zoom_y (the row offset in apply_roi) and
# up/down adjust zoom_x (the column offset) -- presumably intentional for a
# rotated camera mount; confirm against the hardware orientation.

def pan_right():
    """Pan the ROI right by one step, clamped to the pan range."""
    globalz['zoom_y'] = min(globalz['zoom_y'] + globalz['zoom_step'], globalz['zoom_xy_max'])
    set_annotation("Pan Right")

def pan_left():
    """Pan the ROI left by one step, clamped to the pan range."""
    globalz['zoom_y'] = max(globalz['zoom_y'] - globalz['zoom_step'], globalz['zoom_xy_min'])
    set_annotation("Pan Left")

def pan_up():
    """Pan the ROI up by one step, clamped to the pan range."""
    globalz['zoom_x'] = max(globalz['zoom_x'] - globalz['zoom_step'], globalz['zoom_xy_min'])
    set_annotation("Pan Up")

def pan_down():
    """Pan the ROI down by one step, clamped to the pan range."""
    globalz['zoom_x'] = min(globalz['zoom_x'] + globalz['zoom_step'], globalz['zoom_xy_max'])
    set_annotation("Pan Down")

def take_picture(frame):
    """Save *frame* as a timestamped JPEG in the working directory."""
    filename = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S.jpg")
    cv2.imwrite(filename, frame)
    # The pasted code showed "(unknown)" here -- restoring the evident
    # {filename} placeholders lost in forum formatting.
    set_annotation(f"Saved: {filename}")
    print(f"Picture saved: {filename}")

def draw_overlay(frame):
    """Draw the help text and any active annotation onto *frame* in place."""
    global annotation_text, annotation_timer
    font = cv2.FONT_HERSHEY_PLAIN
    lines = [
        "PiGlassV2",
        "P: Take Picture",
        "+/-: Zoom In/Out",
        "Arrow keys: Pan",
        "R: Reset Zoom",
        "O: Toggle Overlay",
        "Q: Quit",
    ]
    # Bottom-left help block, 30 px per line.
    y = SCREEN_H - len(lines) * 30 - 10
    for line in lines:
        cv2.putText(frame, line, (10, y), font, 2, (255, 255, 255), 2)
        y += 30
    # Transient annotation in the screen center; the timer counts down one
    # tick per drawn frame until it expires.
    if annotation_timer > 0:
        cv2.putText(frame, annotation_text,
                    (SCREEN_W // 2 - 200, SCREEN_H // 2),
                    font, 3, (255, 255, 0), 3)
        annotation_timer -= 1

# Arrow key codes across common platforms (GTK/X11, Windows, extended X11).
KEY_UP = [82, 2490368, 65362]
KEY_DOWN = [84, 2621440, 65364]
KEY_LEFT = [81, 2424832, 65361]
KEY_RIGHT = [83, 2555904, 65363]

while True:
    ret, frame = cap.read()
    if not ret:
        break

    display = apply_roi(frame)

    # <- your DeGirum inference goes here, operating on `display`

    if overlay_visible:
        draw_overlay(display)

    cv2.imshow("Camera", display)

    # Mask to 24 bits so extended key codes compare against the lists above.
    key = cv2.waitKey(1) & 0xFFFFFF
    if key == ord('q'):
        break
    elif key == ord('+') or key == ord('='):
        zoom_in()
    elif key == ord('-'):
        zoom_out()
    elif key in KEY_UP:
        pan_up()
    elif key in KEY_DOWN:
        pan_down()
    elif key in KEY_LEFT:
        pan_left()
    elif key in KEY_RIGHT:
        pan_right()
    elif key == ord('r'):
        set_min_zoom()
    elif key == ord('p'):
        take_picture(display)
    elif key == ord('o'):
        overlay_visible = not overlay_visible
        set_annotation("Overlay " + ("ON" if overlay_visible else "OFF"))

cap.release()
cv2.destroyAllWindows()

<degirum_code>

# NOTE(review): indentation below is reconstructed from a whitespace-mangled
# forum paste -- in particular, whether write()/show() sit inside the
# `if detected_license_plates.results:` branch is a guess; confirm intent.
with degirum_tools.Display('AI Camera') as output_display, \
        degirum_tools.open_video_writer('output_license_plate_video.mp4',
                                        w=640, h=480, fps=30.0) as video_writer:
    # First stage: license-plate detection on the video stream.
    for detected_license_plates in degirum_tools.predict_stream(lp_det_model, video_source):
        if detected_license_plates.results:
            # Second stage: OCR on each detected plate crop.
            cropped_license_plates = crop_images(detected_license_plates.image,
                                                 detected_license_plates.results)
            print(f'cropped license plates: {cropped_license_plates}')
            for index, cropped_license_plate in enumerate(cropped_license_plates):
                ocr_results = lpocr_model.predict(cropped_license_plate)
                print(f'MOST IMPORTANT: OCR RESULTS: {ocr_results}')
                license_plate = rearrange_detections(ocr_results.results)
                print(f'final OCR text: {license_plate}')
                # detected_license_plates.results[index]['label'] = license_plate
        video_writer.write(detected_license_plates.image)
        output_display.show(detected_license_plates)

</degirum_code>

Hi @svafadar19 ,

It is easy to do: just define a source for predict_batch() which returns cropped part of the image.

The following is working sample script to demonstrate this approach:

import degirum as dg
import cv2

# Camera index for cv2.VideoCapture (a video file path also works).
VIDEO_SOURCE = 0
STEP = 20  # pixels per key press

# Arrow key codes (extended codes as delivered by cv2.waitKeyEx on Windows;
# see the comment at the waitKeyEx call below)
KEY_LEFT  = 2424832
KEY_RIGHT = 2555904
KEY_UP    = 2490368
KEY_DOWN  = 2621440

# Crop region: center (cx, cy) and size (w, h). Mutated by the key handlers
# in the main loop and re-read by the frame source on every frame.
crop = {"cx": 320, "cy": 240, "w": 400, "h": 300}


def _crop_bounds(fw, fh, crop):
    """Return (x1, y1, x2, y2) for *crop* clipped to an fw x fh frame.

    Guarantees a non-empty region even when the requested center has been
    panned entirely outside the frame (the original code could yield a
    zero-size slice in that case, crashing downstream inference).
    """
    hw, hh = crop["w"] // 2, crop["h"] // 2
    # Clamp the center into the frame so the region always overlaps it.
    cx = min(max(crop["cx"], 0), fw - 1)
    cy = min(max(crop["cy"], 0), fh - 1)
    x1 = max(0, cx - hw)
    y1 = max(0, cy - hh)
    x2 = min(fw, cx + hw)
    y2 = min(fh, cy + hh)
    # Ensure at least one pixel in each dimension.
    if x2 <= x1:
        x2 = min(fw, x1 + 1)
    if y2 <= y1:
        y2 = min(fh, y1 + 1)
    return x1, y1, x2, y2


def cropped_frame_source(path, crop):
    """Generator that reads frames from video and yields center-cropped regions.

    *crop* is re-read on every frame, so the caller may mutate it between
    yields to move/resize the ROI while the stream is running.
    """
    cap = cv2.VideoCapture(path)
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            fh, fw = frame.shape[:2]
            x1, y1, x2, y2 = _crop_bounds(fw, fh, crop)
            yield frame[y1:y2, x1:x2]
    finally:
        cap.release()


# Load a COCO object-detection model from the DeGirum model zoo;
# "@cloud" runs inference on the cloud service rather than local hardware.
model = dg.load_model(
    model_name="yolov8n_relu6_coco--640x640_quant_n2x_orca1_1",
    inference_host_address="@cloud",
    zoo_url="degirum/degirum",
)

# Preview window for annotated results.
WINDOW = "Cropped Detection"
cv2.namedWindow(WINDOW)

try:
    # Arrow keys map to (crop-dict key, signed delta) panning actions.
    _pan_actions = {
        KEY_LEFT:  ("cx", -STEP),
        KEY_RIGHT: ("cx", +STEP),
        KEY_UP:    ("cy", -STEP),
        KEY_DOWN:  ("cy", +STEP),
    }
    # The frame source reads `crop` every frame, so mutating it below
    # changes the ROI fed to the model on the fly.
    for result in model.predict_batch(cropped_frame_source(VIDEO_SOURCE, crop)):
        cv2.imshow(WINDOW, result.image_overlay)

        pressed = cv2.waitKeyEx(1)  # waitKeyEx captures arrow/extended keys on Windows
        if pressed in (ord("q"), 27):           # q / ESC -> quit
            break
        if pressed in (ord("+"), ord("=")):     # + -> grow crop
            crop["w"] += STEP
            crop["h"] += STEP
        elif pressed == ord("-"):               # - -> shrink crop, floor 2*STEP
            crop["w"] = max(STEP * 2, crop["w"] - STEP)
            crop["h"] = max(STEP * 2, crop["h"] - STEP)
        elif pressed in _pan_actions:           # arrows -> move crop center
            axis, delta = _pan_actions[pressed]
            crop[axis] += delta
finally:
    cv2.destroyAllWindows()

Thank you for the code. Is there any way to run two models with your code?

Absolutely! How would you like to run two models? In parallel, processing the same image? Or sequentially, so the second model analyzes bboxes produced by the first model?

Parallel would be preferred :slight_smile:

In this case the easiest way is to use the degirum_tools.CombiningCompoundModel class. It executes two models in parallel on the same input data and merges their results. It expects both models to be of the same kind, so results can be combined. Such a compound model object can be used as a regular model object: it has the same predict_batch() method, delivering the combined result.

You may take a look into this example:
hand_face_person_detection_parallel_video_stream.ipynb - Colab

1 Like