MLK-22606 VPU Decoder: refine decode performance info

author Shijie Qin <shijie.qin@nxp.com>

Wed, 18 Sep 2019 02:43:34 +0000 (10:43 +0800)

committer Shijie Qin <shijie.qin@nxp.com>

Thu, 24 Oct 2019 10:53:20 +0000 (18:53 +0800)
author Shijie Qin <shijie.qin@nxp.com>
Wed, 18 Sep 2019 02:43:34 +0000 (10:43 +0800)
committer Shijie Qin <shijie.qin@nxp.com>
Thu, 24 Oct 2019 10:53:20 +0000 (18:53 +0800)
diff --git a/drivers/mxc/vpu_malone/vpu_b0.c b/drivers/mxc/vpu_malone/vpu_b0.c

index a0ed963..d354b68 100644 (file)
--- a/drivers/mxc/vpu_malone/vpu_b0.c
+++ b/drivers/mxc/vpu_malone/vpu_b0.c
@@ -95,7 +95,7 @@ static int send_abort_cmd(struct vpu_ctx *ctx);
  static int send_stop_cmd(struct vpu_ctx *ctx);
  static int vpu_dec_cmd_reset(struct vpu_ctx *ctx);
  static void vpu_dec_event_decode_error(struct vpu_ctx *ctx);
-static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx, u_int32 uEvent, bool start_flag);
+static void vpu_calculate_performance(struct vpu_ctx *ctx, u_int32 uEvent, const char *str);
  
  #define CHECK_BIT(var, pos) (((var) >> (pos)) & 1)
  
@@ -978,6 +978,11 @@ static int v4l2_ioctl_s_fmt(struct file *file,
         q_data->rect.width = pix_mp->width;
         q_data->rect.height = pix_mp->height;
  
+       if (f->type == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE)
+               vpu_calculate_performance(ctx, 0xff, "capture set fmt");
+       else
+               vpu_calculate_performance(ctx, 0xff, "output set fmt");
+
         return 0;
  }
  
@@ -1008,6 +1013,11 @@ static int vpu_dec_queue_reqbufs(struct queue_data *queue,
                         queue->type ? "CAPTURE" : "OUTPUT",
                         reqbuf->count ? "request" : "free");
  
+       if (queue->type == V4L2_SRC)
+               vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "output request buffer begin" : "output free buffer begin");
+       else
+               vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "capture request buffer begin" : "capture free buffer begin");
+
         if (queue->vb2_q_inited)
                 ret = vb2_reqbufs(&queue->vb2_q, reqbuf);
  
@@ -1015,6 +1025,10 @@ static int vpu_dec_queue_reqbufs(struct queue_data *queue,
                         ctx->str_index,
                         queue->type ? "CAPTURE" : "OUTPUT",
                         reqbuf->count ? "request" : "free");
+       if (queue->type == V4L2_SRC)
+               vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "output request buffer finish" : "output free buffer finish");
+       else
+               vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "capture request buffer finish" : "capture free buffer finish");
         up(&queue->drv_q_lock);
  
         return ret;
@@ -2978,12 +2992,13 @@ static void v4l2_transfer_buffer_to_firmware(struct queue_data *This, struct vb2
         int ret;
  
         if (ctx->start_flag == true) {
-               ctx->perf_time.first_feed_interv = vpu_calculate_performance(ctx, 0xff, false);
+               vpu_calculate_performance(ctx, 0xff, "alloc stream buffer begin");
                 ret = alloc_vpu_buffer(ctx);
                 if (ret) {
                         vpu_err("alloc vpu buffer fail\n");
                         return;
                 }
+               vpu_calculate_performance(ctx, 0xff, "alloc stream buffer finish");
  
                 frame_bytes = transfer_buffer_to_firmware(ctx,
                                                           vb,
@@ -2992,7 +3007,7 @@ static void v4l2_transfer_buffer_to_firmware(struct queue_data *This, struct vb2
                 if (vb->planes[0].bytesused < vb->planes[0].length)
                         vpu_dbg(LVL_INFO, "v4l2_transfer_buffer_to_firmware - set stream_feed_complete - DEBUG 1\n");
  #endif
-               ctx->perf_time.start_cmd_interv = vpu_calculate_performance(ctx, 0xff, false);
+               vpu_calculate_performance(ctx, 0xff, "send start cmd");
                 v4l2_vpu_send_cmd(ctx, ctx->str_index, VID_API_CMD_START, 0, NULL);
                 p_data_req = list_first_entry(&This->drv_q,
                                 typeof(*p_data_req), list);
@@ -3856,6 +3871,7 @@ static void check_wait_res_changed(struct vpu_ctx *ctx)
                                 ctx->res_change_occu_count,
                                 ctx->res_change_send_count,
                                 ctx->res_change_done_count);
+                               vpu_calculate_performance(ctx, 0xff, "first provide buffer");
                         break;
                 }
         }
@@ -3990,51 +4006,66 @@ static bool check_is_need_reset_after_abort(struct vpu_ctx *ctx)
         return false;
  }
  
-static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx,  u_int32 uEvent, bool start_flag)
+static struct vpu_dec_perf_queue *get_dec_perf_queue(struct vpu_ctx *ctx)
+{
+       struct vpu_dec_perf_queue *perf;
+
+       perf = vzalloc(sizeof(*perf));
+       if (!perf)
+               return NULL;
+
+       atomic64_add(sizeof(*perf), &ctx->statistic.total_alloc_size);
+
+       return perf;
+}
+
+static void put_dec_perf_queue(struct vpu_ctx *ctx, struct vpu_dec_perf_queue *perf)
+{
+       if (!perf)
+               return;
+
+       vfree(perf);
+       atomic64_sub(sizeof(*perf), &ctx->statistic.total_alloc_size);
+}
+
+static void cleanup_perf_queue(struct vpu_ctx *ctx)
+{
+       struct vpu_dec_perf_queue *perf;
+       struct vpu_dec_perf_queue *tmp;
+
+       mutex_lock(&ctx->perf_lock);
+       list_for_each_entry_safe(perf, tmp, &ctx->perf_q, list) {
+               list_del_init(&perf->list);
+               put_dec_perf_queue(ctx, perf);
+       }
+       mutex_unlock(&ctx->perf_lock);
+}
+
+static void vpu_calculate_performance(struct vpu_ctx *ctx,  u_int32 uEvent, const char *str)
  {
         u_int64 Time;
-       u_int64 interv = 0;
         u_int64 total_Time;
         struct timeval tv;
+       struct vpu_dec_perf_queue *perf;
  
         if (!vpu_show_perf_ena)
-               return 0;
+               return;
         if (!(vpu_show_perf_idx & (1<<ctx->str_index)))
-               return 0;
+               return;
  
         do_gettimeofday(&tv);
         Time = ((tv.tv_sec * 1000000ULL) + tv.tv_usec) / 1000ULL;
  
-       if (start_flag) {
-               ctx->perf_time.open_time = Time;
-               ctx->perf_time.last_time = Time;
-               return interv;
-       }
-
-       interv = Time - ctx->perf_time.last_time;
-
         switch (uEvent) {
-       case VID_API_EVENT_START_DONE:
-               ctx->perf_time.start_done_interv = interv;
-               break;
-       case VID_API_EVENT_SEQ_HDR_FOUND:
-               if (ctx->b_firstseq) {
-                       ctx->perf_time.seq_hdr_found_interv = interv;
-
-                       ctx->perf_time.first_decoded_time = Time;
-                       ctx->perf_time.last_decoded_time = Time;
-                       ctx->perf_time.first_ready_time = Time;
-                       ctx->perf_time.last_ready_time = Time;
-               }
-
-               break;
         case VID_API_EVENT_PIC_DECODED:
-               if (ctx->statistic.event[VID_API_EVENT_PIC_DECODED] == 1)
-                       ctx->perf_time.first_decoded_interv = Time - ctx->perf_time.last_decoded_time;
+               if (ctx->statistic.event[VID_API_EVENT_PIC_DECODED] == 1) {
+                       ctx->perf_time.first_decoded_time = Time - 1;
+                       ctx->perf_time.last_decoded_time = Time - 1;
+               }
  
                 ctx->perf_time.cur_decoded_interv = Time - ctx->perf_time.last_decoded_time;
                 total_Time = Time - ctx->perf_time.first_decoded_time;
-               ctx->perf_time.decoded_fps = ctx->statistic.event[VID_API_EVENT_PIC_DECODED] / (total_Time / 1000ULL);
+               ctx->perf_time.decoded_fps = ctx->statistic.event[VID_API_EVENT_PIC_DECODED] * 1000ULL / total_Time;
                 ctx->perf_time.last_decoded_time = Time;
                 if (vpu_show_perf_ent & VPU_DECODED_EVENT_PERF_MASK)
                         vpu_dbg(LVL_WARN, "[%2d] dec[%8ld]  interv: %8ld ms  fps: %ld\n",
@@ -4045,12 +4076,14 @@ static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx,  u_int32 uEvent, b
  
                 break;
         case VID_API_EVENT_FRAME_BUFF_RDY:
-               if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] == 1)
-                       ctx->perf_time.first_ready_interv = Time - ctx->perf_time.last_ready_time;
+               if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] == 1) {
+                       ctx->perf_time.first_ready_time = Time - 1;
+                       ctx->perf_time.last_ready_time = Time - 1;
+               }
  
                 ctx->perf_time.cur_ready_interv = Time - ctx->perf_time.last_ready_time;
                 total_Time = Time - ctx->perf_time.first_ready_time;
-               ctx->perf_time.ready_fps = ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] / (total_Time / 1000ULL);
+               ctx->perf_time.ready_fps = ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] * 1000ULL / total_Time;
                 ctx->perf_time.last_ready_time = Time;
                 if (vpu_show_perf_ent & VPU_READY_EVENT_PERF_MASK)
                         vpu_dbg(LVL_WARN, "[%2d] rdy[%8ld]  interv: %8ld ms  fps: %ld\n",
@@ -4064,8 +4097,23 @@ static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx,  u_int32 uEvent, b
                 break;
         }
  
-       ctx->perf_time.last_time = Time;
-       return interv;
+       if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] > 1)
+               return;
+
+       if (str != NULL) {
+               perf = get_dec_perf_queue(ctx);
+               if (!perf) {
+                       vpu_err("get_dec_perf_queue fail\n");
+                       return;
+               }
+
+               scnprintf(perf->str, (VPU_MAX_STEP_STRING_LENGTH - 1), str);
+               perf->time = Time;
+
+               mutex_lock(&ctx->perf_lock);
+               list_add_tail(&perf->list, &ctx->perf_q);
+               mutex_unlock(&ctx->perf_lock);
+       }
  }
  
  static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32 uEvent, u_int32 *event_data)
@@ -4108,7 +4156,7 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32
                 ctx->firmware_stopped = false;
                 ctx->firmware_finished = false;
                 ctx->req_frame_count = 0;
-               vpu_calculate_performance(ctx, uEvent, false);
+               vpu_calculate_performance(ctx, uEvent, "start done");
                 break;
         case VID_API_EVENT_STOPPED: {
                 vpu_dbg(LVL_INFO, "receive VID_API_EVENT_STOPPED\n");
@@ -4211,7 +4259,10 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32
                 vpu_dec_valid_ts(ctx, consumed_pic_bytesused, p_data_req);
                 This->process_count++;
                 }
-               vpu_calculate_performance(ctx, uEvent, false);
+               if (ctx->statistic.event[VID_API_EVENT_PIC_DECODED] == 1)
+                       vpu_calculate_performance(ctx, uEvent, "first decoded");
+               else
+                       vpu_calculate_performance(ctx, uEvent, NULL);
  
                 ctx->frm_dec_delay--;
                 ctx->fifo_low = false;
@@ -4235,7 +4286,6 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32
                 if (ctx->wait_res_change_done)
                         vpu_dbg(LVL_WARN, "warning: ctx[%d] update seq info when waiting res change\n",
                                 ctx->str_index);
-               vpu_calculate_performance(ctx, uEvent, false);
  
                 down(&ctx->q_data[V4L2_DST].drv_q_lock);
                 respond_req_frame(ctx, &ctx->q_data[V4L2_DST], true);
@@ -4279,6 +4329,7 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32
                         send_source_change_event(ctx);
                         pStreamPitchInfo->uFramePitch = 0x4000;
                         ctx->b_firstseq = false;
+                       vpu_calculate_performance(ctx, uEvent, "seq_hdr_found");
                 }
                 }
                 break;
@@ -4385,7 +4436,10 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32
                 u_int32 *FrameInfo = (u_int32 *)event_data;
  
                 report_buffer_done(ctx, FrameInfo);
-               vpu_calculate_performance(ctx, uEvent, false);
+               if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] == 1)
+                       vpu_calculate_performance(ctx, uEvent, "first ready");
+               else
+                       vpu_calculate_performance(ctx, uEvent, NULL);
         }
                 break;
         case VID_API_EVENT_CHUNK_DECODED:
@@ -4643,6 +4697,7 @@ static void release_vpu_ctx(struct vpu_ctx *ctx)
         release_queue_data(ctx);
         free_decoder_buffer(ctx);
         destroy_log_info_queue(ctx);
+       cleanup_perf_queue(ctx);
  
         if (atomic64_read(&ctx->statistic.total_alloc_size) != 0)
                 vpu_err("error: memory leak for vpu kalloc buffer\n");
@@ -4651,6 +4706,7 @@ static void release_vpu_ctx(struct vpu_ctx *ctx)
  
         mutex_destroy(&ctx->instance_mutex);
         mutex_destroy(&ctx->cmd_lock);
+       mutex_destroy(&ctx->perf_lock);
         clear_bit(ctx->str_index, &ctx->dev->instance_mask);
         ctx->dev->ctx[ctx->str_index] = NULL;
         pm_runtime_put_sync(ctx->dev->generic_dev);
@@ -5516,6 +5572,13 @@ static ssize_t show_instance_perf_info(struct device *dev,
         struct vpu_ctx *ctx;
         struct vpu_statistic *statistic;
         int num = 0;
+       u_int64 start_time = 0;
+       u_int64 prev_time = 0;
+       u_int64 interv = 0;
+       u_int64 total = 0;
+       struct vpu_dec_perf_queue *perf;
+       struct vpu_dec_perf_queue *tmp;
+       int i = 0;
  
         ctx = container_of(attr, struct vpu_ctx, dev_attr_instance_perf);
         statistic = &ctx->statistic;
@@ -5526,34 +5589,38 @@ static ssize_t show_instance_perf_info(struct device *dev,
                 return num;
  
         num += scnprintf(buf + num, PAGE_SIZE - num, "beginning:\n");
-       num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-                       "feed first frame",
-                       ctx->perf_time.first_feed_interv);
-       num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-                       "send start cmd",
-                       ctx->perf_time.start_cmd_interv);
-       num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-                       "start_done event",
-                       ctx->perf_time.start_done_interv);
-       num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-                       "seq_hdr_found event",
-                       ctx->perf_time.seq_hdr_found_interv);
-       num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-                       "first decoded",
-                       ctx->perf_time.first_decoded_interv);
-       num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-                       "first ready",
-                       ctx->perf_time.first_ready_interv);
+       num += scnprintf(
+               buf + num, PAGE_SIZE - num,
+               "unit: ms \t\t time-point \t   interval    total\n");
+
+       mutex_lock(&ctx->perf_lock);
+       list_for_each_entry_safe(perf, tmp, &ctx->perf_q, list) {
+               if (i == 0) {
+                       start_time = perf->time;
+                       prev_time = perf->time;
+               }
+
+               interv = perf->time - prev_time;
+               total = perf->time - start_time;
+               num += scnprintf(buf + num, PAGE_SIZE - num,
+                                "%40s: %8ld %8ld\n", perf->str, interv, total);
+               prev_time = perf->time;
+
+               if (++i > 50) {
+                       num += scnprintf(buf + num, PAGE_SIZE - num,
+                               "Too many initialization steps, omitting the following\n");
+                       break;
+               }
+       }
+       mutex_unlock(&ctx->perf_lock);
  
         num += scnprintf(buf + num, PAGE_SIZE - num, "decoded:\n");
-       num += scnprintf(buf + num, PAGE_SIZE - num, "\t dec[%8ld]: %8ld ms \t average: %8ld fps\n",
+       num += scnprintf(buf + num, PAGE_SIZE - num, "\t count: %8ld \t fps: %8ld\n",
                         ctx->statistic.event[VID_API_EVENT_PIC_DECODED],
-                       ctx->perf_time.cur_decoded_interv,
                         ctx->perf_time.decoded_fps);
         num += scnprintf(buf + num, PAGE_SIZE - num, "ready:\n");
-       num += scnprintf(buf + num, PAGE_SIZE - num, "\t rdy[%8ld]: %8ld ms \t average: %8ld fps\n",
+       num += scnprintf(buf + num, PAGE_SIZE - num, "\t count: %8ld \t fps: %8ld\n",
                         ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY],
-                       ctx->perf_time.cur_ready_interv,
                         ctx->perf_time.ready_fps);
  
         return num;
@@ -5854,9 +5921,10 @@ static int v4l2_open(struct file *filp)
  
         mutex_init(&ctx->instance_mutex);
         mutex_init(&ctx->cmd_lock);
+       mutex_init(&ctx->perf_lock);
         if (kfifo_alloc(&ctx->msg_fifo,
-                               sizeof(struct event_msg) * VID_API_MESSAGE_LIMIT,
-                               GFP_KERNEL)) {
+                       sizeof(struct event_msg) * VID_API_MESSAGE_LIMIT,
+                       GFP_KERNEL)) {
                 vpu_err("fail to alloc fifo when open\n");
                 ret = -ENOMEM;
                 goto err_alloc_fifo;
@@ -5880,6 +5948,7 @@ static int v4l2_open(struct file *filp)
         ctx->hang_status = false;
         ctx->first_dump_data_flag = true;
         INIT_LIST_HEAD(&ctx->cmd_q);
+       INIT_LIST_HEAD(&ctx->perf_q);
         ctx->tsm = createTSManager(tsm_buffer_size);
         if (!ctx->tsm)
                 goto err_create_tsm;
@@ -5927,12 +5996,13 @@ static int v4l2_open(struct file *filp)
  
         vpu_dbg(LVL_BIT_FLOW, "<%d> ctx[%d] open\n",
                         current->pid, ctx->str_index);
-       vpu_calculate_performance(ctx, 0xff, true);
+       vpu_calculate_performance(ctx, 0xff, "open device");
  
         return 0;
  
  err_firmware_load:
         destroy_log_info_queue(ctx);
+       cleanup_perf_queue(ctx);
         release_queue_data(ctx);
  
         if (vpu_frmcrcdump_ena)
@@ -5946,6 +6016,9 @@ err_create_tsm:
         kfifo_free(&ctx->msg_fifo);
         dev->ctx[idx] = NULL;
  err_alloc_fifo:
+       mutex_destroy(&ctx->instance_mutex);
+       mutex_destroy(&ctx->cmd_lock);
+       mutex_destroy(&ctx->perf_lock);
         destroy_workqueue(ctx->instance_wq);
  err_alloc_wq:
         ctrls_delete_decoder(ctx);
diff --git a/drivers/mxc/vpu_malone/vpu_b0.h b/drivers/mxc/vpu_malone/vpu_b0.h

index 66d24f7..11801d0 100644 (file)
--- a/drivers/mxc/vpu_malone/vpu_b0.h
+++ b/drivers/mxc/vpu_malone/vpu_b0.h
@@ -56,6 +56,7 @@ extern unsigned int vpu_dbg_level_decoder;
  #define MAX_MBI_NUM 18 // same with MEDIA_PLAYER_MAX_MBI_UNIT defined in firmware
  #define MAX_TIMEOUT_COUNT 10
  #define VPU_REG_BASE 0x40000000
+#define VPU_MAX_STEP_STRING_LENGTH 40
  
  #define V4L2_PIX_FMT_NV12_10BIT    v4l2_fourcc('N', 'T', '1', '2') /*  Y/CbCr 4:2:0 for 10bit  */
  #define INVALID_FRAME_DEPTH -1
@@ -321,13 +322,6 @@ struct vpu_dec_cmd_request {
  
  struct vpu_dec_perf_time {
         u_int64 open_time;
-       u_int64 last_time;
-       u_int64 first_feed_interv;
-       u_int64 start_cmd_interv;
-       u_int64 start_done_interv;
-       u_int64 seq_hdr_found_interv;
-       u_int64 first_decoded_interv;
-       u_int64 first_ready_interv;
  
         u_int64 first_decoded_time;
         u_int64 last_decoded_time;
@@ -340,6 +334,12 @@ struct vpu_dec_perf_time {
         u_int64 ready_fps;
  };
  
+struct vpu_dec_perf_queue {
+       struct list_head list;
+       char str[VPU_MAX_STEP_STRING_LENGTH];
+       u_int64 time;
+};
+
  struct vpu_ctx {
         struct vpu_dev *dev;
         struct v4l2_fh fh;
@@ -432,6 +432,9 @@ struct vpu_ctx {
         int res_change_occu_count;
         int res_change_send_count;
         int res_change_done_count;
+
+       struct list_head perf_q;
+       struct mutex perf_lock;
  };
  
  #define LVL_WARN               (1 << 0)
author	Shijie Qin <shijie.qin@nxp.com>
	Wed, 18 Sep 2019 02:43:34 +0000 (10:43 +0800)
committer	Shijie Qin <shijie.qin@nxp.com>
	Thu, 24 Oct 2019 10:53:20 +0000 (18:53 +0800)
drivers/mxc/vpu_malone/vpu_b0.c		patch \| blob \| history
drivers/mxc/vpu_malone/vpu_b0.h		patch \| blob \| history