添加注释

2024-08-07 09:32:38 +08:00
commit 5d9b9a6d9f
256 changed files with 19346 additions and 0 deletions
--- a/torch2trt/imgs/yolov5l-face.jpg
+++ b/torch2trt/imgs/yolov5l-face.jpg
--- a/torch2trt/imgs/yolov5m-face.jpg
+++ b/torch2trt/imgs/yolov5m-face.jpg
--- a/torch2trt/imgs/yolov5n-0.5.jpg
+++ b/torch2trt/imgs/yolov5n-0.5.jpg
--- a/torch2trt/imgs/yolov5n-face.jpg
+++ b/torch2trt/imgs/yolov5n-face.jpg
--- a/torch2trt/imgs/yolov5s-face.jpg
+++ b/torch2trt/imgs/yolov5s-face.jpg
--- a/torch2trt/main.py
+++ b/torch2trt/main.py
@@ -0,0 +1,98 @@
+import os
+import sys
+import cv2
+import copy
+import torch
+import argparse
+root_path=os.path.dirname(os.path.abspath(os.path.dirname(__file__))) # project root: parent of the directory that contains this file
+sys.path.append(root_path)  # append the project root to sys.path (done before the project-level imports below)
+from utils.general import check_img_size,non_max_suppression_face,scale_coords,xyxy2xywh
+from utils.datasets import letterbox
+from detect_plate import scale_coords_landmarks,show_results
+from torch2trt.trt_model import TrtModel
+cur_path=os.path.abspath(os.path.dirname(__file__))  # absolute path of the directory containing this script; used for the default sample image and for result.jpg
+def img_process(img_path,long_side=640,stride_max=32):
+    '''
+    图像预处理
+    '''
+    orgimg=cv2.imread(img_path)
+    img0 = copy.deepcopy(orgimg)
+    h0, w0 = orgimg.shape[:2]  # orig hw
+    r = long_side/ max(h0, w0)  # resize image to img_size
+    if r != 1:  # always resize down, only resize up if training with augmentation
+        interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
+        img0 = cv2.resize(img0, (int(w0 * r), int(h0 * r)), interpolation=interp)
+
+    imgsz = check_img_size(long_side, s=stride_max)  # check img_size
+
+    img = letterbox(img0, new_shape=imgsz,auto=False)[0] # auto True最小矩形   False固定尺度
+    # Convert
+    img = img[:, :, ::-1].transpose(2, 0, 1).copy()  # BGR to RGB, to 3x416x416
+    img = torch.from_numpy(img)
+    img = img.float()  # uint8 to fp16/32
+    img /= 255.0  # 0 - 255 to 0.0 - 1.0
+    if img.ndimension() == 3:
+        img = img.unsqueeze(0)
+    return img,orgimg
+
+def img_vis(img,orgimg,pred,vis_thres = 0.6):
+    '''
+    预测可视化
+    vis_thres: 可视化阈值
+    '''
+
+    print('img.shape: ', img.shape)
+    print('orgimg.shape: ', orgimg.shape)
+
+    no_vis_nums=0
+    # Process detections
+    for i, det in enumerate(pred):  # detections per image
+        gn = torch.tensor(orgimg.shape)[[1, 0, 1, 0]]  # normalization gain whwh
+        gn_lks = torch.tensor(orgimg.shape)[[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]]  # normalization gain landmarks
+        if len(det):
+            # Rescale boxes from img_size to im0 size
+            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], orgimg.shape).round()
+
+            # Print results
+            for c in det[:, -1].unique():
+                n = (det[:, -1] == c).sum()  # detections per class
+
+            det[:, 5:15] = scale_coords_landmarks(img.shape[2:], det[:, 5:15], orgimg.shape).round()
+
+            for j in range(det.size()[0]):
+                
+                
+                if det[j, 4].cpu().numpy() < vis_thres:
+                    no_vis_nums+=1
+                    continue
+
+                xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(-1).tolist()
+                conf = det[j, 4].cpu().numpy()
+                landmarks = (det[j, 5:15].view(1, 10) / gn_lks).view(-1).tolist()
+                class_num = det[j, 15].cpu().numpy()
+                orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)
+
+    cv2.imwrite(cur_path+'/result.jpg', orgimg)
+    print('result save in '+cur_path+'/result.jpg')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--img_path', type=str, default=cur_path+"/sample.jpg", help='img path') 
+    parser.add_argument('--trt_path', type=str, required=True, help='trt_path') 
+    parser.add_argument('--output_shape', type=list, default=[1,25200,16], help='input[1,3,640,640] ->  output[1,25200,16]') 
+    opt = parser.parse_args()
+
+
+    img,orgimg=img_process(opt.img_path) 
+    model=TrtModel(opt.trt_path)
+    pred=model(img.numpy()).reshape(opt.output_shape) # forward
+    model.destroy()
+
+    # Apply NMS
+    pred = non_max_suppression_face(torch.from_numpy(pred), conf_thres=0.3, iou_thres=0.5)
+   
+    # ============可视化================
+    img_vis(img,orgimg,pred)
+
+
--- a/torch2trt/readme.md
+++ b/torch2trt/readme.md
@@ -0,0 +1,68 @@
+English | [简体中文](readme_CN.md)
+
+
+
+# Overall process
+
+## 1.Pytorch->TensorRT
+
+ ```shell
+ python export.py --weights "torch's path" --onnx2trt  --fp16_trt 
+ ```
+
+
+## 2.TensorRT inference
+```shell
+python torch2trt/main.py --trt_path "trt's path"
+```
+Pipeline: image preprocessing -> TensorRT inference -> visualization (the annotated image is saved as `result.jpg` in the `torch2trt` directory).
+
+
+
+# Inference time comparison
+
+| Backbone |Pytorch(ms) |TensorRT_FP16(ms) |
+|:---:|:----:|:----:|
+|yolov5n-0.5|     7.7     |        2.1        |
+|yolov5n-face|     7.7     |        2.4        |
+|yolov5s-face|     5.6     |        2.2        |
+|yolov5m-face|     9.9     |        3.3        |
+|yolov5l-face|    15.9     |        4.5        |
+
+>   Pytorch=1.10.0+cu102    TensorRT=8.2.0.6   Hardware=rtx2080ti
+
+```shell
+python torch2trt/speed.py --torch_path "torch's path" --trt_path "trt's path"
+```
+
+
+
+# Visualization
+
+<table>
+    <tr>
+            <th>yolov5n-0.5</th>
+            <th>yolov5n-face</th>
+    </tr>
+    <tr>
+        <td><img src="./imgs/yolov5n-0.5.jpg" /></td>
+        <td><img src="./imgs/yolov5n-face.jpg" /></td>
+    </tr>
+</table>
+
+<table>
+    <tr>
+            <th>yolov5s-face</th>
+            <th>yolov5m-face</th>
+            <th>yolov5l-face</th>
+    </tr>
+    <tr>
+        <td><img src="./imgs/yolov5s-face.jpg" /></td>
+        <td><img src="./imgs/yolov5m-face.jpg" /></td>
+        <td><img src="./imgs/yolov5l-face.jpg" /></td>
+    </tr>
+</table>
+
+
+
+
--- a/torch2trt/readme_CN.md
+++ b/torch2trt/readme_CN.md
@@ -0,0 +1,65 @@
+
+
+# 整体流程
+
+## 1.Pytorch->TensorRT
+
+ ```shell
+ python export.py --weights "torch权重路径" --onnx2trt  --fp16_trt 
+ ```
+
+
+## 2.TensorRT推理
+```shell
+python torch2trt/main.py --trt_path "trt权重路径"
+```
+
+图像预处理 -> TensorRT推理 -> 可视化结果
+
+
+
+# 耗时对比
+
+|  | Pytorch(ms) | TensorRT_FP16(ms) |
+|:---:|:----:|:----:|
+| yolov5n-0.5  |     7.7     |        2.1        |
+| yolov5n-face |     7.7     |        2.4        |
+| yolov5s-face |     5.6     |        2.2        |
+| yolov5m-face |     9.9     |        3.3        |
+| yolov5l-face |    15.9     |        4.5        |
+
+>  Pytorch=1.10.0+cu102     TensorRT=8.2.0.6     Hardware=rtx2080ti
+
+```shell
+python torch2trt/speed.py --torch_path "torch权重路径" --trt_path "trt权重路径"
+```
+
+
+
+# 可视化
+
+<table>
+    <tr>
+            <th>yolov5n-0.5</th>
+            <th>yolov5n-face</th>
+    </tr>
+    <tr>
+        <td><img src="./imgs/yolov5n-0.5.jpg" /></td>
+        <td><img src="./imgs/yolov5n-face.jpg" /></td>
+    </tr>
+</table>
+
+<table>
+    <tr>
+            <th>yolov5s-face</th>
+            <th>yolov5m-face</th>
+            <th>yolov5l-face</th>
+    </tr>
+    <tr>
+        <td><img src="./imgs/yolov5s-face.jpg" /></td>
+        <td><img src="./imgs/yolov5m-face.jpg" /></td>
+        <td><img src="./imgs/yolov5l-face.jpg" /></td>
+    </tr>
+</table>
+
+
--- a/torch2trt/sample.jpg
+++ b/torch2trt/sample.jpg
--- a/torch2trt/speed.py
+++ b/torch2trt/speed.py
@@ -0,0 +1,49 @@
+from models.experimental import attempt_load
+from torch2trt.trt_model import TrtModel
+import argparse
+import torch
+import time
+from tqdm import tqdm
+
+
+def run(model,img,warmup_iter,iter):
+    
+    
+    print('start warm up...')
+    for _ in tqdm(range(warmup_iter)):
+        model(img) 
+    
+   
+    print('start calculate...')
+    torch.cuda.synchronize()
+    start = time.time()
+    for __ in tqdm(range(iter)):
+        model(img) 
+        torch.cuda.synchronize()
+    end = time.time()
+    return ((end - start) * 1000)/float(iter)
+    
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--torch_path', type=str,required=True, help='torch weights path')  
+    parser.add_argument('--trt_path', type=str,required=True, help='tensorrt weights path')
+
+    parser.add_argument('--device', type=int,default=0, help='cuda device')
+    parser.add_argument('--img_shape', type=list,default=[1,3,640,640], help='tensorrt weights path')
+    parser.add_argument('--warmup_iter', type=int, default=100,help='warm up iter')  
+    parser.add_argument('--iter', type=int, default=300,help='average elapsed time of iterations')  
+    opt = parser.parse_args()
+
+
+    # -----------------------torch-----------------------------------------
+    img = torch.zeros(opt.img_shape)
+    model = attempt_load(opt.torch_path, map_location=torch.device('cpu'))  # load FP32 model
+    model.eval()
+    total_time=run(model.to(opt.device),img.to(opt.device),opt.warmup_iter,opt.iter)
+    print('Pytorch is  %.2f ms/img'%total_time)
+
+    # -----------------------tensorrt-----------------------------------------
+    model=TrtModel(opt.trt_path)
+    total_time=run(model,img.numpy(),opt.warmup_iter,opt.iter)
+    model.destroy()
+    print('TensorRT is  %.2f ms/img'%total_time)
--- a/torch2trt/trt_model.py
+++ b/torch2trt/trt_model.py
@@ -0,0 +1,118 @@
+import pycuda.autoinit
+import pycuda.driver as cuda
+import tensorrt as trt
+import numpy as np
+
+# Bitmask with the EXPLICIT_BATCH network-creation flag set; passed to builder.create_network() in ONNX_to_TRT
+EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+# Module-level TensorRT logger at WARNING severity, used by ONNX_to_TRT
+TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+def GiB(val):
+    return val * 1 << 30
+
+def ONNX_to_TRT(onnx_model_path=None,trt_engine_path=None,fp16_mode=False):
+    """
+    仅适用TensorRT V8版本
+    生成cudaEngine，并保存引擎文件(仅支持固定输入尺度)  
+    
+    fp16_mode: True则fp16预测
+    onnx_model_path: 将加载的onnx权重路径
+    trt_engine_path: trt引擎文件保存路径
+    """
+    builder = trt.Builder(TRT_LOGGER)
+    network = builder.create_network(EXPLICIT_BATCH)
+    parser = trt.OnnxParser(network, TRT_LOGGER)
+    
+    config = builder.create_builder_config()
+    config.max_workspace_size=GiB(1) 
+    if fp16_mode:
+        config.set_flag(trt.BuilderFlag.FP16) 
+    with open(onnx_model_path, 'rb') as model:
+        assert parser.parse(model.read())
+        serialized_engine=builder.build_serialized_network(network, config)
+
+   
+    with open(trt_engine_path, 'wb') as f:
+        f.write(serialized_engine)  # 序列化
+
+    print('TensorRT file in ' + trt_engine_path)
+    print('============ONNX->TensorRT SUCCESS============')
+
+class TrtModel():
+    '''
+    TensorRT infer
+    '''
+    def __init__(self,trt_path):
+        self.ctx=cuda.Device(0).make_context()
+        stream = cuda.Stream()
+        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
+        runtime = trt.Runtime(TRT_LOGGER)
+
+        # Deserialize the engine from file
+        with open(trt_path, "rb") as f:
+            engine = runtime.deserialize_cuda_engine(f.read())
+        context = engine.create_execution_context()
+
+        host_inputs = []
+        cuda_inputs = []
+        host_outputs = []
+        cuda_outputs = []
+        bindings = []
+
+        for binding in engine:
+            print('bingding:', binding, engine.get_binding_shape(binding))
+            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
+            dtype = trt.nptype(engine.get_binding_dtype(binding))
+            # Allocate host and device buffers
+            host_mem = cuda.pagelocked_empty(size, dtype)
+            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
+            # Append the device buffer to device bindings.
+            bindings.append(int(cuda_mem))
+            # Append to the appropriate list.
+            if engine.binding_is_input(binding):
+                self.input_w = engine.get_binding_shape(binding)[-1]
+                self.input_h = engine.get_binding_shape(binding)[-2]
+                host_inputs.append(host_mem)
+                cuda_inputs.append(cuda_mem)
+            else:
+                host_outputs.append(host_mem)
+                cuda_outputs.append(cuda_mem)
+
+        # Store
+        self.stream = stream
+        self.context = context
+        self.engine = engine
+        self.host_inputs = host_inputs
+        self.cuda_inputs = cuda_inputs
+        self.host_outputs = host_outputs
+        self.cuda_outputs = cuda_outputs
+        self.bindings = bindings
+        self.batch_size = engine.max_batch_size
+    
+    def __call__(self,img_np_nchw):
+        '''
+        TensorRT推理
+        :param img_np_nchw: 输入图像
+        '''
+        self.ctx.push()
+
+        # Restore
+        stream = self.stream
+        context = self.context
+        engine = self.engine
+        host_inputs = self.host_inputs
+        cuda_inputs = self.cuda_inputs
+        host_outputs = self.host_outputs
+        cuda_outputs = self.cuda_outputs
+        bindings = self.bindings
+
+        np.copyto(host_inputs[0], img_np_nchw.ravel())
+        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
+        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
+        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
+        stream.synchronize()
+        self.ctx.pop()
+        return host_outputs[0]
+
+
+    def destroy(self):
+        # Remove any context from the top of the context stack, deactivating it.
+        self.ctx.pop()