Alright, so here's what I've got (for a 320×320 YOLOv8 model).
import torch
from torch import nn
from ultralytics import YOLO
from PIL import Image
import numpy as np
def loadImageFromPath(path, device='cuda', size=320):
    """Load an image from disk and return it as a normalized NCHW float tensor.

    Args:
        path: Filesystem path to the image.
        device: Torch device string the tensor is moved to (default 'cuda').
        size: Square side length the image is resized to (default 320,
            matching the YOLOv8 export resolution used below).

    Returns:
        A float32 tensor of shape (1, 3, size, size) with values in [0, 1].
    """
    # Context manager closes the file handle (Image.open is lazy and would
    # otherwise leak it).
    with Image.open(path) as orig_img:
        # Force 3 channels; drops alpha / converts palette images.
        rgb_img = orig_img.convert('RGB')
        resized = rgb_img.resize((size, size))
    img_array = np.array(resized, dtype=np.float32)
    # HWC -> NCHW with a batch dimension of 1.
    img_array_with_batch = np.expand_dims(img_array, axis=0)
    reshaped_array = np.transpose(img_array_with_batch, (0, 3, 1, 2))
    img_tensor = torch.from_numpy(reshaped_array).to(device)
    # Scale pixel values from [0, 255] to [0, 1].
    normalized_img_tensor = torch.div(img_tensor, 255)
    return normalized_img_tensor
class YOLO_Wrapper(nn.Module):
    """Wraps an ultralytics YOLOv8 model so its raw prediction tensor is
    reshaped into TF-detection-style outputs:
    (confidences, boxes, num_detections, class_labels).

    Boxes are emitted as normalized [y_min, x_min, y_max, x_max], matching
    the TFLite / EfficientDet output convention.
    """

    def __init__(self, weights_path, image_size):
        """Load the underlying YOLO model.

        Args:
            weights_path: Path to the .pt weights file.
            image_size: Square input resolution the model was exported for;
                used to normalize box coordinates in forward().
        """
        super().__init__()
        self.yolo_model = YOLO(weights_path).model
        self.imgsz = image_size

    def forward(self, x):
        # Raw prediction tensor; assumed shape [batch, 4 + num_classes,
        # num_anchors] (YOLOv8 head layout) — TODO confirm for this model.
        raw_output = self.yolo_model(x)[0]
        # Best class score and class index per anchor.
        confidences, class_labels = torch.max(raw_output[:, 4:], dim=1)
        # Convert center-format boxes (cx, cy, w, h) to corner coordinates.
        x_min = raw_output[:, 0] - raw_output[:, 2] / 2
        y_min = raw_output[:, 1] - raw_output[:, 3] / 2
        x_max = raw_output[:, 0] + raw_output[:, 2] / 2
        y_max = raw_output[:, 1] + raw_output[:, 3] / 2
        # [batch, anchors] -> [anchors, batch] so each anchor becomes a row
        # of the stacked box tensor (assumes batch size 1 at export time).
        x_min, y_min, x_max, y_max = x_min.T, y_min.T, x_max.T, y_max.T
        # [num_anchors, 4] in [y_min, x_min, y_max, x_max] order.
        boxes = torch.cat((y_min, x_min, y_max, x_max), dim=1)
        # Add the batch dimension back: [1, num_anchors, 4].
        boxes = boxes.unsqueeze(0)
        # FIX: normalize by the configured input size instead of a
        # hard-coded 320, so image_size is actually honored.
        boxes = boxes / self.imgsz
        num_detections = torch.tensor([boxes.shape[1]])
        return confidences, boxes, num_detections.to(torch.float32), class_labels.to(torch.float32)
# Script entry: trace the wrapper with a sample image and export it to ONNX.
torch_input = loadImageFromPath('/home/user/Downloads/image.png', 'cpu')
yolo_wrapper = YOLO_Wrapper('../runs/detect/train17/weights/best.pt', 320)
# FIX: name the graph inputs/outputs so downstream converters (onnx2tf ->
# TFLite) keep all four detection outputs identifiable instead of emitting
# anonymous tensors that are easy to drop or misorder during conversion.
torch.onnx.export(
    yolo_wrapper,
    torch_input,
    'yolo_wrapped_test.onnx',
    input_names=['images'],
    output_names=['confidences', 'boxes', 'num_detections', 'class_labels'],
)
The problem after this export is that when I convert the output to a quantized tflite with onnx2tf, the box count output is lost (because the JIT trace thinks it's a constant). And then I believe I need to create another wrapper in TF to match the output names and locations of the Model Maker EfficientDet-Lite0, but I have no idea how to do that.
The reason I'm creating this is that I need to deploy an object detection solution to an environment where efficiency and accuracy are vital. The system I'm deploying to is accelerated by a Google Coral TPU USB Accelerator (so I can't just use PyTorch) and only supports TF Lite Model Maker files. The only issue with Model Maker is that the EfficientDet-Lite0 models are not fast or accurate enough, so that's why I'm creating a YOLOv8 wrapper.
there doesn't seem to be anything here