From 0bab60e68f71ed9638b2e7759baf09f5572f818a Mon Sep 17 00:00:00 2001 From: Shijie Qin Date: Wed, 18 Sep 2019 10:43:34 +0800 Subject: [PATCH] MLK-22606 VPU Decoder: refine decode performance info 1. change to use list record performance info for better trace each time-point and not limited to a fixed flow. 2. add total time from open device to each time-point. Signed-off-by: Shijie Qin Reviewed-by: Zhou Peng Reviewed-by: ming_qian (cherry picked from commit 563d17921b33e3b1be96414ef80535146f00edb5) --- drivers/mxc/vpu_malone/vpu_b0.c | 205 ++++++++++++++++++++++---------- drivers/mxc/vpu_malone/vpu_b0.h | 17 +-- 2 files changed, 149 insertions(+), 73 deletions(-) diff --git a/drivers/mxc/vpu_malone/vpu_b0.c b/drivers/mxc/vpu_malone/vpu_b0.c index a0ed963b53b4..d354b68c90ab 100644 --- a/drivers/mxc/vpu_malone/vpu_b0.c +++ b/drivers/mxc/vpu_malone/vpu_b0.c @@ -95,7 +95,7 @@ static int send_abort_cmd(struct vpu_ctx *ctx); static int send_stop_cmd(struct vpu_ctx *ctx); static int vpu_dec_cmd_reset(struct vpu_ctx *ctx); static void vpu_dec_event_decode_error(struct vpu_ctx *ctx); -static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx, u_int32 uEvent, bool start_flag); +static void vpu_calculate_performance(struct vpu_ctx *ctx, u_int32 uEvent, const char *str); #define CHECK_BIT(var, pos) (((var) >> (pos)) & 1) @@ -978,6 +978,11 @@ static int v4l2_ioctl_s_fmt(struct file *file, q_data->rect.width = pix_mp->width; q_data->rect.height = pix_mp->height; + if (f->type == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) + vpu_calculate_performance(ctx, 0xff, "capture set fmt"); + else + vpu_calculate_performance(ctx, 0xff, "output set fmt"); + return 0; } @@ -1008,6 +1013,11 @@ static int vpu_dec_queue_reqbufs(struct queue_data *queue, queue->type ? "CAPTURE" : "OUTPUT", reqbuf->count ? "request" : "free"); + if (queue->type == V4L2_SRC) + vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "output request buffer begin" : "output free buffer begin"); + else + vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "capture request buffer begin" : "capture free buffer begin"); + if (queue->vb2_q_inited) ret = vb2_reqbufs(&queue->vb2_q, reqbuf); @@ -1015,6 +1025,10 @@ static int vpu_dec_queue_reqbufs(struct queue_data *queue, ctx->str_index, queue->type ? "CAPTURE" : "OUTPUT", reqbuf->count ? "request" : "free"); + if (queue->type == V4L2_SRC) + vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "output request buffer finish" : "output free buffer finish"); + else + vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "capture request buffer finish" : "capture free buffer finish"); up(&queue->drv_q_lock); return ret; @@ -2978,12 +2992,13 @@ static void v4l2_transfer_buffer_to_firmware(struct queue_data *This, struct vb2 int ret; if (ctx->start_flag == true) { - ctx->perf_time.first_feed_interv = vpu_calculate_performance(ctx, 0xff, false); + vpu_calculate_performance(ctx, 0xff, "alloc stream buffer begin"); ret = alloc_vpu_buffer(ctx); if (ret) { vpu_err("alloc vpu buffer fail\n"); return; } + vpu_calculate_performance(ctx, 0xff, "alloc stream buffer finish"); frame_bytes = transfer_buffer_to_firmware(ctx, vb, @@ -2992,7 +3007,7 @@ static void v4l2_transfer_buffer_to_firmware(struct queue_data *This, struct vb2 if (vb->planes[0].bytesused < vb->planes[0].length) vpu_dbg(LVL_INFO, "v4l2_transfer_buffer_to_firmware - set stream_feed_complete - DEBUG 1\n"); #endif - ctx->perf_time.start_cmd_interv = vpu_calculate_performance(ctx, 0xff, false); + vpu_calculate_performance(ctx, 0xff, "send start cmd"); v4l2_vpu_send_cmd(ctx, ctx->str_index, VID_API_CMD_START, 0, NULL); p_data_req = list_first_entry(&This->drv_q, typeof(*p_data_req), list); @@ -3856,6 +3871,7 @@ static void check_wait_res_changed(struct vpu_ctx *ctx) ctx->res_change_occu_count, ctx->res_change_send_count, ctx->res_change_done_count); + vpu_calculate_performance(ctx, 0xff, "first provide buffer"); break; } } @@ -3990,51 +4006,66 @@ static bool check_is_need_reset_after_abort(struct vpu_ctx *ctx) return false; } -static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx, u_int32 uEvent, bool start_flag) +static struct vpu_dec_perf_queue *get_dec_perf_queue(struct vpu_ctx *ctx) +{ + struct vpu_dec_perf_queue *perf; + + perf = vzalloc(sizeof(*perf)); + if (!perf) + return NULL; + + atomic64_add(sizeof(*perf), &ctx->statistic.total_alloc_size); + + return perf; +} + +static void put_dec_perf_queue(struct vpu_ctx *ctx, struct vpu_dec_perf_queue *perf) +{ + if (!perf) + return; + + vfree(perf); + atomic64_sub(sizeof(*perf), &ctx->statistic.total_alloc_size); +} + +static void cleanup_perf_queue(struct vpu_ctx *ctx) +{ + struct vpu_dec_perf_queue *perf; + struct vpu_dec_perf_queue *tmp; + + mutex_lock(&ctx->perf_lock); + list_for_each_entry_safe(perf, tmp, &ctx->perf_q, list) { + list_del_init(&perf->list); + put_dec_perf_queue(ctx, perf); + } + mutex_unlock(&ctx->perf_lock); +} + +static void vpu_calculate_performance(struct vpu_ctx *ctx, u_int32 uEvent, const char *str) { u_int64 Time; - u_int64 interv = 0; u_int64 total_Time; struct timeval tv; + struct vpu_dec_perf_queue *perf; if (!vpu_show_perf_ena) - return 0; + return; if (!(vpu_show_perf_idx & (1<str_index))) - return 0; + return; do_gettimeofday(&tv); Time = ((tv.tv_sec * 1000000ULL) + tv.tv_usec) / 1000ULL; - if (start_flag) { - ctx->perf_time.open_time = Time; - ctx->perf_time.last_time = Time; - return interv; - } - - interv = Time - ctx->perf_time.last_time; - switch (uEvent) { - case VID_API_EVENT_START_DONE: - ctx->perf_time.start_done_interv = interv; - break; - case VID_API_EVENT_SEQ_HDR_FOUND: - if (ctx->b_firstseq) { - ctx->perf_time.seq_hdr_found_interv = interv; - - ctx->perf_time.first_decoded_time = Time; - ctx->perf_time.last_decoded_time = Time; - ctx->perf_time.first_ready_time = Time; - ctx->perf_time.last_ready_time = Time; - } - - break; case VID_API_EVENT_PIC_DECODED: - if (ctx->statistic.event[VID_API_EVENT_PIC_DECODED] == 1) - ctx->perf_time.first_decoded_interv = Time - ctx->perf_time.last_decoded_time; + if (ctx->statistic.event[VID_API_EVENT_PIC_DECODED] == 1) { + ctx->perf_time.first_decoded_time = Time - 1; + ctx->perf_time.last_decoded_time = Time - 1; + } ctx->perf_time.cur_decoded_interv = Time - ctx->perf_time.last_decoded_time; total_Time = Time - ctx->perf_time.first_decoded_time; - ctx->perf_time.decoded_fps = ctx->statistic.event[VID_API_EVENT_PIC_DECODED] / (total_Time / 1000ULL); + ctx->perf_time.decoded_fps = ctx->statistic.event[VID_API_EVENT_PIC_DECODED] * 1000ULL / total_Time; ctx->perf_time.last_decoded_time = Time; if (vpu_show_perf_ent & VPU_DECODED_EVENT_PERF_MASK) vpu_dbg(LVL_WARN, "[%2d] dec[%8ld] interv: %8ld ms fps: %ld\n", @@ -4045,12 +4076,14 @@ static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx, u_int32 uEvent, b break; case VID_API_EVENT_FRAME_BUFF_RDY: - if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] == 1) - ctx->perf_time.first_ready_interv = Time - ctx->perf_time.last_ready_time; + if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] == 1) { + ctx->perf_time.first_ready_time = Time - 1; + ctx->perf_time.last_ready_time = Time - 1; + } ctx->perf_time.cur_ready_interv = Time - ctx->perf_time.last_ready_time; total_Time = Time - ctx->perf_time.first_ready_time; - ctx->perf_time.ready_fps = ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] / (total_Time / 1000ULL); + ctx->perf_time.ready_fps = ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] * 1000ULL / total_Time; ctx->perf_time.last_ready_time = Time; if (vpu_show_perf_ent & VPU_READY_EVENT_PERF_MASK) vpu_dbg(LVL_WARN, "[%2d] rdy[%8ld] interv: %8ld ms fps: %ld\n", @@ -4064,8 +4097,23 @@ static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx, u_int32 uEvent, b break; } - ctx->perf_time.last_time = Time; - return interv; + if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] > 1) + return; + + if (str != NULL) { + perf = get_dec_perf_queue(ctx); + if (!perf) { + vpu_err("get_dec_perf_queue fail\n"); + return; + } + + scnprintf(perf->str, (VPU_MAX_STEP_STRING_LENGTH - 1), str); + perf->time = Time; + + mutex_lock(&ctx->perf_lock); + list_add_tail(&perf->list, &ctx->perf_q); + mutex_unlock(&ctx->perf_lock); + } } static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32 uEvent, u_int32 *event_data) @@ -4108,7 +4156,7 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32 ctx->firmware_stopped = false; ctx->firmware_finished = false; ctx->req_frame_count = 0; - vpu_calculate_performance(ctx, uEvent, false); + vpu_calculate_performance(ctx, uEvent, "start done"); break; case VID_API_EVENT_STOPPED: { vpu_dbg(LVL_INFO, "receive VID_API_EVENT_STOPPED\n"); @@ -4211,7 +4259,10 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32 vpu_dec_valid_ts(ctx, consumed_pic_bytesused, p_data_req); This->process_count++; } - vpu_calculate_performance(ctx, uEvent, false); + if (ctx->statistic.event[VID_API_EVENT_PIC_DECODED] == 1) + vpu_calculate_performance(ctx, uEvent, "first decoded"); + else + vpu_calculate_performance(ctx, uEvent, NULL); ctx->frm_dec_delay--; ctx->fifo_low = false; @@ -4235,7 +4286,6 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32 if (ctx->wait_res_change_done) vpu_dbg(LVL_WARN, "warning: ctx[%d] update seq info when waiting res change\n", ctx->str_index); - vpu_calculate_performance(ctx, uEvent, false); down(&ctx->q_data[V4L2_DST].drv_q_lock); respond_req_frame(ctx, &ctx->q_data[V4L2_DST], true); @@ -4279,6 +4329,7 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32 send_source_change_event(ctx); pStreamPitchInfo->uFramePitch = 0x4000; ctx->b_firstseq = false; + vpu_calculate_performance(ctx, uEvent, "seq_hdr_found"); } } break; @@ -4385,7 +4436,10 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32 u_int32 *FrameInfo = (u_int32 *)event_data; report_buffer_done(ctx, FrameInfo); - vpu_calculate_performance(ctx, uEvent, false); + if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] == 1) + vpu_calculate_performance(ctx, uEvent, "first ready"); + else + vpu_calculate_performance(ctx, uEvent, NULL); } break; case VID_API_EVENT_CHUNK_DECODED: @@ -4643,6 +4697,7 @@ static void release_vpu_ctx(struct vpu_ctx *ctx) release_queue_data(ctx); free_decoder_buffer(ctx); destroy_log_info_queue(ctx); + cleanup_perf_queue(ctx); if (atomic64_read(&ctx->statistic.total_alloc_size) != 0) vpu_err("error: memory leak for vpu kalloc buffer\n"); @@ -4651,6 +4706,7 @@ static void release_vpu_ctx(struct vpu_ctx *ctx) mutex_destroy(&ctx->instance_mutex); mutex_destroy(&ctx->cmd_lock); + mutex_destroy(&ctx->perf_lock); clear_bit(ctx->str_index, &ctx->dev->instance_mask); ctx->dev->ctx[ctx->str_index] = NULL; pm_runtime_put_sync(ctx->dev->generic_dev); @@ -5516,6 +5572,13 @@ static ssize_t show_instance_perf_info(struct device *dev, struct vpu_ctx *ctx; struct vpu_statistic *statistic; int num = 0; + u_int64 start_time = 0; + u_int64 prev_time = 0; + u_int64 interv = 0; + u_int64 total = 0; + struct vpu_dec_perf_queue *perf; + struct vpu_dec_perf_queue *tmp; + int i = 0; ctx = container_of(attr, struct vpu_ctx, dev_attr_instance_perf); statistic = &ctx->statistic; @@ -5526,34 +5589,38 @@ static ssize_t show_instance_perf_info(struct device *dev, return num; num += scnprintf(buf + num, PAGE_SIZE - num, "beginning:\n"); - num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n", - "feed first frame", - ctx->perf_time.first_feed_interv); - num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n", - "send start cmd", - ctx->perf_time.start_cmd_interv); - num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n", - "start_done event", - ctx->perf_time.start_done_interv); - num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n", - "seq_hdr_found event", - ctx->perf_time.seq_hdr_found_interv); - num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n", - "first decoded", - ctx->perf_time.first_decoded_interv); - num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n", - "first ready", - ctx->perf_time.first_ready_interv); + num += scnprintf( + buf + num, PAGE_SIZE - num, + "unit: ms \t\t time-point \t interval total\n"); + + mutex_lock(&ctx->perf_lock); + list_for_each_entry_safe(perf, tmp, &ctx->perf_q, list) { + if (i == 0) { + start_time = perf->time; + prev_time = perf->time; + } + + interv = perf->time - prev_time; + total = perf->time - start_time; + num += scnprintf(buf + num, PAGE_SIZE - num, + "%40s: %8ld %8ld\n", perf->str, interv, total); + prev_time = perf->time; + + if (++i > 50) { + num += scnprintf(buf + num, PAGE_SIZE - num, + "Too many initialization steps, omitting the following\n"); + break; + } + } + mutex_unlock(&ctx->perf_lock); num += scnprintf(buf + num, PAGE_SIZE - num, "decoded:\n"); - num += scnprintf(buf + num, PAGE_SIZE - num, "\t dec[%8ld]: %8ld ms \t average: %8ld fps\n", + num += scnprintf(buf + num, PAGE_SIZE - num, "\t count: %8ld \t fps: %8ld\n", ctx->statistic.event[VID_API_EVENT_PIC_DECODED], - ctx->perf_time.cur_decoded_interv, ctx->perf_time.decoded_fps); num += scnprintf(buf + num, PAGE_SIZE - num, "ready:\n"); - num += scnprintf(buf + num, PAGE_SIZE - num, "\t rdy[%8ld]: %8ld ms \t average: %8ld fps\n", + num += scnprintf(buf + num, PAGE_SIZE - num, "\t count: %8ld \t fps: %8ld\n", ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY], - ctx->perf_time.cur_ready_interv, ctx->perf_time.ready_fps); return num; @@ -5854,9 +5921,10 @@ static int v4l2_open(struct file *filp) mutex_init(&ctx->instance_mutex); mutex_init(&ctx->cmd_lock); + mutex_init(&ctx->perf_lock); if (kfifo_alloc(&ctx->msg_fifo, - sizeof(struct event_msg) * VID_API_MESSAGE_LIMIT, - GFP_KERNEL)) { + sizeof(struct event_msg) * VID_API_MESSAGE_LIMIT, + GFP_KERNEL)) { vpu_err("fail to alloc fifo when open\n"); ret = -ENOMEM; goto err_alloc_fifo; @@ -5880,6 +5948,7 @@ static int v4l2_open(struct file *filp) ctx->hang_status = false; ctx->first_dump_data_flag = true; INIT_LIST_HEAD(&ctx->cmd_q); + INIT_LIST_HEAD(&ctx->perf_q); ctx->tsm = createTSManager(tsm_buffer_size); if (!ctx->tsm) goto err_create_tsm; @@ -5927,12 +5996,13 @@ static int v4l2_open(struct file *filp) vpu_dbg(LVL_BIT_FLOW, "<%d> ctx[%d] open\n", current->pid, ctx->str_index); - vpu_calculate_performance(ctx, 0xff, true); + vpu_calculate_performance(ctx, 0xff, "open device"); return 0; err_firmware_load: destroy_log_info_queue(ctx); + cleanup_perf_queue(ctx); release_queue_data(ctx); if (vpu_frmcrcdump_ena) @@ -5946,6 +6016,9 @@ err_create_tsm: kfifo_free(&ctx->msg_fifo); dev->ctx[idx] = NULL; err_alloc_fifo: + mutex_destroy(&ctx->instance_mutex); + mutex_destroy(&ctx->cmd_lock); + mutex_destroy(&ctx->perf_lock); destroy_workqueue(ctx->instance_wq); err_alloc_wq: ctrls_delete_decoder(ctx); diff --git a/drivers/mxc/vpu_malone/vpu_b0.h b/drivers/mxc/vpu_malone/vpu_b0.h index 66d24f760eb0..11801d0a97a1 100644 --- a/drivers/mxc/vpu_malone/vpu_b0.h +++ b/drivers/mxc/vpu_malone/vpu_b0.h @@ -56,6 +56,7 @@ extern unsigned int vpu_dbg_level_decoder; #define MAX_MBI_NUM 18 // same with MEDIA_PLAYER_MAX_MBI_UNIT defined in firmware #define MAX_TIMEOUT_COUNT 10 #define VPU_REG_BASE 0x40000000 +#define VPU_MAX_STEP_STRING_LENGTH 40 #define V4L2_PIX_FMT_NV12_10BIT v4l2_fourcc('N', 'T', '1', '2') /* Y/CbCr 4:2:0 for 10bit */ #define INVALID_FRAME_DEPTH -1 @@ -321,13 +322,6 @@ struct vpu_dec_cmd_request { struct vpu_dec_perf_time { u_int64 open_time; - u_int64 last_time; - u_int64 first_feed_interv; - u_int64 start_cmd_interv; - u_int64 start_done_interv; - u_int64 seq_hdr_found_interv; - u_int64 first_decoded_interv; - u_int64 first_ready_interv; u_int64 first_decoded_time; u_int64 last_decoded_time; @@ -340,6 +334,12 @@ struct vpu_dec_perf_time { u_int64 ready_fps; }; +struct vpu_dec_perf_queue { + struct list_head list; + char str[VPU_MAX_STEP_STRING_LENGTH]; + u_int64 time; +}; + struct vpu_ctx { struct vpu_dev *dev; struct v4l2_fh fh; @@ -432,6 +432,9 @@ struct vpu_ctx { int res_change_occu_count; int res_change_send_count; int res_change_done_count; + + struct list_head perf_q; + struct mutex perf_lock; }; #define LVL_WARN (1 << 0) -- 2.17.1