From 0bab60e68f71ed9638b2e7759baf09f5572f818a Mon Sep 17 00:00:00 2001
From: Shijie Qin <shijie.qin@nxp.com>
Date: Wed, 18 Sep 2019 10:43:34 +0800
Subject: [PATCH] MLK-22606 VPU Decoder: refine decode performance info

1. change to use list record performance info for better trace each
   time-point and not limited to a fixed flow.
2. add total time from open device to each time-point.

Signed-off-by: Shijie Qin <shijie.qin@nxp.com>
Reviewed-by: Zhou Peng <eagle.zhou@nxp.com>
Reviewed-by: ming_qian <ming.qian@nxp.com>
(cherry picked from commit 563d17921b33e3b1be96414ef80535146f00edb5)
---
 drivers/mxc/vpu_malone/vpu_b0.c | 205 ++++++++++++++++++++++----------
 drivers/mxc/vpu_malone/vpu_b0.h |  17 +--
 2 files changed, 149 insertions(+), 73 deletions(-)

diff --git a/drivers/mxc/vpu_malone/vpu_b0.c b/drivers/mxc/vpu_malone/vpu_b0.c
index a0ed963b53b4..d354b68c90ab 100644
--- a/drivers/mxc/vpu_malone/vpu_b0.c
+++ b/drivers/mxc/vpu_malone/vpu_b0.c
@@ -95,7 +95,7 @@ static int send_abort_cmd(struct vpu_ctx *ctx);
 static int send_stop_cmd(struct vpu_ctx *ctx);
 static int vpu_dec_cmd_reset(struct vpu_ctx *ctx);
 static void vpu_dec_event_decode_error(struct vpu_ctx *ctx);
-static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx, u_int32 uEvent, bool start_flag);
+static void vpu_calculate_performance(struct vpu_ctx *ctx, u_int32 uEvent, const char *str);
 
 #define CHECK_BIT(var, pos) (((var) >> (pos)) & 1)
 
@@ -978,6 +978,11 @@ static int v4l2_ioctl_s_fmt(struct file *file,
 	q_data->rect.width = pix_mp->width;
 	q_data->rect.height = pix_mp->height;
 
+	if (f->type == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE)
+		vpu_calculate_performance(ctx, 0xff, "capture set fmt");
+	else
+		vpu_calculate_performance(ctx, 0xff, "output set fmt");
+
 	return 0;
 }
 
@@ -1008,6 +1013,11 @@ static int vpu_dec_queue_reqbufs(struct queue_data *queue,
 			queue->type ? "CAPTURE" : "OUTPUT",
 			reqbuf->count ? "request" : "free");
 
+	if (queue->type == V4L2_SRC)
+		vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "output request buffer begin" : "output free buffer begin");
+	else
+		vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "capture request buffer begin" : "capture free buffer begin");
+
 	if (queue->vb2_q_inited)
 		ret = vb2_reqbufs(&queue->vb2_q, reqbuf);
 
@@ -1015,6 +1025,10 @@ static int vpu_dec_queue_reqbufs(struct queue_data *queue,
 			ctx->str_index,
 			queue->type ? "CAPTURE" : "OUTPUT",
 			reqbuf->count ? "request" : "free");
+	if (queue->type == V4L2_SRC)
+		vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "output request buffer finish" : "output free buffer finish");
+	else
+		vpu_calculate_performance(ctx, 0xff, reqbuf->count ? "capture request buffer finish" : "capture free buffer finish");
 	up(&queue->drv_q_lock);
 
 	return ret;
@@ -2978,12 +2992,13 @@ static void v4l2_transfer_buffer_to_firmware(struct queue_data *This, struct vb2
 	int ret;
 
 	if (ctx->start_flag == true) {
-		ctx->perf_time.first_feed_interv = vpu_calculate_performance(ctx, 0xff, false);
+		vpu_calculate_performance(ctx, 0xff, "alloc stream buffer begin");
 		ret = alloc_vpu_buffer(ctx);
 		if (ret) {
 			vpu_err("alloc vpu buffer fail\n");
 			return;
 		}
+		vpu_calculate_performance(ctx, 0xff, "alloc stream buffer finish");
 
 		frame_bytes = transfer_buffer_to_firmware(ctx,
 							  vb,
@@ -2992,7 +3007,7 @@ static void v4l2_transfer_buffer_to_firmware(struct queue_data *This, struct vb2
 		if (vb->planes[0].bytesused < vb->planes[0].length)
 			vpu_dbg(LVL_INFO, "v4l2_transfer_buffer_to_firmware - set stream_feed_complete - DEBUG 1\n");
 #endif
-		ctx->perf_time.start_cmd_interv = vpu_calculate_performance(ctx, 0xff, false);
+		vpu_calculate_performance(ctx, 0xff, "send start cmd");
 		v4l2_vpu_send_cmd(ctx, ctx->str_index, VID_API_CMD_START, 0, NULL);
 		p_data_req = list_first_entry(&This->drv_q,
 				typeof(*p_data_req), list);
@@ -3856,6 +3871,7 @@ static void check_wait_res_changed(struct vpu_ctx *ctx)
 				ctx->res_change_occu_count,
 				ctx->res_change_send_count,
 				ctx->res_change_done_count);
+				vpu_calculate_performance(ctx, 0xff, "first provide buffer");
 			break;
 		}
 	}
@@ -3990,51 +4006,66 @@ static bool check_is_need_reset_after_abort(struct vpu_ctx *ctx)
 	return false;
 }
 
-static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx,  u_int32 uEvent, bool start_flag)
+static struct vpu_dec_perf_queue *get_dec_perf_queue(struct vpu_ctx *ctx)
+{
+	struct vpu_dec_perf_queue *perf;
+
+	perf = vzalloc(sizeof(*perf));
+	if (!perf)
+		return NULL;
+
+	atomic64_add(sizeof(*perf), &ctx->statistic.total_alloc_size);
+
+	return perf;
+}
+
+static void put_dec_perf_queue(struct vpu_ctx *ctx, struct vpu_dec_perf_queue *perf)
+{
+	if (!perf)
+		return;
+
+	vfree(perf);
+	atomic64_sub(sizeof(*perf), &ctx->statistic.total_alloc_size);
+}
+
+static void cleanup_perf_queue(struct vpu_ctx *ctx)
+{
+	struct vpu_dec_perf_queue *perf;
+	struct vpu_dec_perf_queue *tmp;
+
+	mutex_lock(&ctx->perf_lock);
+	list_for_each_entry_safe(perf, tmp, &ctx->perf_q, list) {
+		list_del_init(&perf->list);
+		put_dec_perf_queue(ctx, perf);
+	}
+	mutex_unlock(&ctx->perf_lock);
+}
+
+static void vpu_calculate_performance(struct vpu_ctx *ctx,  u_int32 uEvent, const char *str)
 {
 	u_int64 Time;
-	u_int64 interv = 0;
 	u_int64 total_Time;
 	struct timeval tv;
+	struct vpu_dec_perf_queue *perf;
 
 	if (!vpu_show_perf_ena)
-		return 0;
+		return;
 	if (!(vpu_show_perf_idx & (1<<ctx->str_index)))
-		return 0;
+		return;
 
 	do_gettimeofday(&tv);
 	Time = ((tv.tv_sec * 1000000ULL) + tv.tv_usec) / 1000ULL;
 
-	if (start_flag) {
-		ctx->perf_time.open_time = Time;
-		ctx->perf_time.last_time = Time;
-		return interv;
-	}
-
-	interv = Time - ctx->perf_time.last_time;
-
 	switch (uEvent) {
-	case VID_API_EVENT_START_DONE:
-		ctx->perf_time.start_done_interv = interv;
-		break;
-	case VID_API_EVENT_SEQ_HDR_FOUND:
-		if (ctx->b_firstseq) {
-			ctx->perf_time.seq_hdr_found_interv = interv;
-
-			ctx->perf_time.first_decoded_time = Time;
-			ctx->perf_time.last_decoded_time = Time;
-			ctx->perf_time.first_ready_time = Time;
-			ctx->perf_time.last_ready_time = Time;
-		}
-
-		break;
 	case VID_API_EVENT_PIC_DECODED:
-		if (ctx->statistic.event[VID_API_EVENT_PIC_DECODED] == 1)
-			ctx->perf_time.first_decoded_interv = Time - ctx->perf_time.last_decoded_time;
+		if (ctx->statistic.event[VID_API_EVENT_PIC_DECODED] == 1) {
+			ctx->perf_time.first_decoded_time = Time - 1;
+			ctx->perf_time.last_decoded_time = Time - 1;
+		}
 
 		ctx->perf_time.cur_decoded_interv = Time - ctx->perf_time.last_decoded_time;
 		total_Time = Time - ctx->perf_time.first_decoded_time;
-		ctx->perf_time.decoded_fps = ctx->statistic.event[VID_API_EVENT_PIC_DECODED] / (total_Time / 1000ULL);
+		ctx->perf_time.decoded_fps = ctx->statistic.event[VID_API_EVENT_PIC_DECODED] * 1000ULL / total_Time;
 		ctx->perf_time.last_decoded_time = Time;
 		if (vpu_show_perf_ent & VPU_DECODED_EVENT_PERF_MASK)
 			vpu_dbg(LVL_WARN, "[%2d] dec[%8ld]  interv: %8ld ms  fps: %ld\n",
@@ -4045,12 +4076,14 @@ static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx,  u_int32 uEvent, b
 
 		break;
 	case VID_API_EVENT_FRAME_BUFF_RDY:
-		if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] == 1)
-			ctx->perf_time.first_ready_interv = Time - ctx->perf_time.last_ready_time;
+		if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] == 1) {
+			ctx->perf_time.first_ready_time = Time - 1;
+			ctx->perf_time.last_ready_time = Time - 1;
+		}
 
 		ctx->perf_time.cur_ready_interv = Time - ctx->perf_time.last_ready_time;
 		total_Time = Time - ctx->perf_time.first_ready_time;
-		ctx->perf_time.ready_fps = ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] / (total_Time / 1000ULL);
+		ctx->perf_time.ready_fps = ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] * 1000ULL / total_Time;
 		ctx->perf_time.last_ready_time = Time;
 		if (vpu_show_perf_ent & VPU_READY_EVENT_PERF_MASK)
 			vpu_dbg(LVL_WARN, "[%2d] rdy[%8ld]  interv: %8ld ms  fps: %ld\n",
@@ -4064,8 +4097,23 @@ static u_int64 vpu_calculate_performance(struct vpu_ctx *ctx,  u_int32 uEvent, b
 		break;
 	}
 
-	ctx->perf_time.last_time = Time;
-	return interv;
+	if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] > 1)
+		return;
+
+	if (str != NULL) {
+		perf = get_dec_perf_queue(ctx);
+		if (!perf) {
+			vpu_err("get_dec_perf_queue fail\n");
+			return;
+		}
+
+		scnprintf(perf->str, (VPU_MAX_STEP_STRING_LENGTH - 1), str);
+		perf->time = Time;
+
+		mutex_lock(&ctx->perf_lock);
+		list_add_tail(&perf->list, &ctx->perf_q);
+		mutex_unlock(&ctx->perf_lock);
+	}
 }
 
 static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32 uEvent, u_int32 *event_data)
@@ -4108,7 +4156,7 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32
 		ctx->firmware_stopped = false;
 		ctx->firmware_finished = false;
 		ctx->req_frame_count = 0;
-		vpu_calculate_performance(ctx, uEvent, false);
+		vpu_calculate_performance(ctx, uEvent, "start done");
 		break;
 	case VID_API_EVENT_STOPPED: {
 		vpu_dbg(LVL_INFO, "receive VID_API_EVENT_STOPPED\n");
@@ -4211,7 +4259,10 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32
 		vpu_dec_valid_ts(ctx, consumed_pic_bytesused, p_data_req);
 		This->process_count++;
 		}
-		vpu_calculate_performance(ctx, uEvent, false);
+		if (ctx->statistic.event[VID_API_EVENT_PIC_DECODED] == 1)
+			vpu_calculate_performance(ctx, uEvent, "first decoded");
+		else
+			vpu_calculate_performance(ctx, uEvent, NULL);
 
 		ctx->frm_dec_delay--;
 		ctx->fifo_low = false;
@@ -4235,7 +4286,6 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32
 		if (ctx->wait_res_change_done)
 			vpu_dbg(LVL_WARN, "warning: ctx[%d] update seq info when waiting res change\n",
 				ctx->str_index);
-		vpu_calculate_performance(ctx, uEvent, false);
 
 		down(&ctx->q_data[V4L2_DST].drv_q_lock);
 		respond_req_frame(ctx, &ctx->q_data[V4L2_DST], true);
@@ -4279,6 +4329,7 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32
 			send_source_change_event(ctx);
 			pStreamPitchInfo->uFramePitch = 0x4000;
 			ctx->b_firstseq = false;
+			vpu_calculate_performance(ctx, uEvent, "seq_hdr_found");
 		}
 		}
 		break;
@@ -4385,7 +4436,10 @@ static void vpu_api_event_handler(struct vpu_ctx *ctx, u_int32 uStrIdx, u_int32
 		u_int32 *FrameInfo = (u_int32 *)event_data;
 
 		report_buffer_done(ctx, FrameInfo);
-		vpu_calculate_performance(ctx, uEvent, false);
+		if (ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY] == 1)
+			vpu_calculate_performance(ctx, uEvent, "first ready");
+		else
+			vpu_calculate_performance(ctx, uEvent, NULL);
 	}
 		break;
 	case VID_API_EVENT_CHUNK_DECODED:
@@ -4643,6 +4697,7 @@ static void release_vpu_ctx(struct vpu_ctx *ctx)
 	release_queue_data(ctx);
 	free_decoder_buffer(ctx);
 	destroy_log_info_queue(ctx);
+	cleanup_perf_queue(ctx);
 
 	if (atomic64_read(&ctx->statistic.total_alloc_size) != 0)
 		vpu_err("error: memory leak for vpu kalloc buffer\n");
@@ -4651,6 +4706,7 @@ static void release_vpu_ctx(struct vpu_ctx *ctx)
 
 	mutex_destroy(&ctx->instance_mutex);
 	mutex_destroy(&ctx->cmd_lock);
+	mutex_destroy(&ctx->perf_lock);
 	clear_bit(ctx->str_index, &ctx->dev->instance_mask);
 	ctx->dev->ctx[ctx->str_index] = NULL;
 	pm_runtime_put_sync(ctx->dev->generic_dev);
@@ -5516,6 +5572,13 @@ static ssize_t show_instance_perf_info(struct device *dev,
 	struct vpu_ctx *ctx;
 	struct vpu_statistic *statistic;
 	int num = 0;
+	u_int64 start_time = 0;
+	u_int64 prev_time = 0;
+	u_int64 interv = 0;
+	u_int64 total = 0;
+	struct vpu_dec_perf_queue *perf;
+	struct vpu_dec_perf_queue *tmp;
+	int i = 0;
 
 	ctx = container_of(attr, struct vpu_ctx, dev_attr_instance_perf);
 	statistic = &ctx->statistic;
@@ -5526,34 +5589,38 @@ static ssize_t show_instance_perf_info(struct device *dev,
 		return num;
 
 	num += scnprintf(buf + num, PAGE_SIZE - num, "beginning:\n");
-	num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-			"feed first frame",
-			ctx->perf_time.first_feed_interv);
-	num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-			"send start cmd",
-			ctx->perf_time.start_cmd_interv);
-	num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-			"start_done event",
-			ctx->perf_time.start_done_interv);
-	num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-			"seq_hdr_found event",
-			ctx->perf_time.seq_hdr_found_interv);
-	num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-			"first decoded",
-			ctx->perf_time.first_decoded_interv);
-	num += scnprintf(buf + num, PAGE_SIZE - num, "\t%40s: %8ld ms\n",
-			"first ready",
-			ctx->perf_time.first_ready_interv);
+	num += scnprintf(
+		buf + num, PAGE_SIZE - num,
+		"unit: ms \t\t time-point \t   interval    total\n");
+
+	mutex_lock(&ctx->perf_lock);
+	list_for_each_entry_safe(perf, tmp, &ctx->perf_q, list) {
+		if (i == 0) {
+			start_time = perf->time;
+			prev_time = perf->time;
+		}
+
+		interv = perf->time - prev_time;
+		total = perf->time - start_time;
+		num += scnprintf(buf + num, PAGE_SIZE - num,
+				 "%40s: %8ld %8ld\n", perf->str, interv, total);
+		prev_time = perf->time;
+
+		if (++i > 50) {
+			num += scnprintf(buf + num, PAGE_SIZE - num,
+				"Too many initialization steps, omitting the following\n");
+			break;
+		}
+	}
+	mutex_unlock(&ctx->perf_lock);
 
 	num += scnprintf(buf + num, PAGE_SIZE - num, "decoded:\n");
-	num += scnprintf(buf + num, PAGE_SIZE - num, "\t dec[%8ld]: %8ld ms \t average: %8ld fps\n",
+	num += scnprintf(buf + num, PAGE_SIZE - num, "\t count: %8ld \t fps: %8ld\n",
 			ctx->statistic.event[VID_API_EVENT_PIC_DECODED],
-			ctx->perf_time.cur_decoded_interv,
 			ctx->perf_time.decoded_fps);
 	num += scnprintf(buf + num, PAGE_SIZE - num, "ready:\n");
-	num += scnprintf(buf + num, PAGE_SIZE - num, "\t rdy[%8ld]: %8ld ms \t average: %8ld fps\n",
+	num += scnprintf(buf + num, PAGE_SIZE - num, "\t count: %8ld \t fps: %8ld\n",
 			ctx->statistic.event[VID_API_EVENT_FRAME_BUFF_RDY],
-			ctx->perf_time.cur_ready_interv,
 			ctx->perf_time.ready_fps);
 
 	return num;
@@ -5854,9 +5921,10 @@ static int v4l2_open(struct file *filp)
 
 	mutex_init(&ctx->instance_mutex);
 	mutex_init(&ctx->cmd_lock);
+	mutex_init(&ctx->perf_lock);
 	if (kfifo_alloc(&ctx->msg_fifo,
-				sizeof(struct event_msg) * VID_API_MESSAGE_LIMIT,
-				GFP_KERNEL)) {
+			sizeof(struct event_msg) * VID_API_MESSAGE_LIMIT,
+			GFP_KERNEL)) {
 		vpu_err("fail to alloc fifo when open\n");
 		ret = -ENOMEM;
 		goto err_alloc_fifo;
@@ -5880,6 +5948,7 @@ static int v4l2_open(struct file *filp)
 	ctx->hang_status = false;
 	ctx->first_dump_data_flag = true;
 	INIT_LIST_HEAD(&ctx->cmd_q);
+	INIT_LIST_HEAD(&ctx->perf_q);
 	ctx->tsm = createTSManager(tsm_buffer_size);
 	if (!ctx->tsm)
 		goto err_create_tsm;
@@ -5927,12 +5996,13 @@ static int v4l2_open(struct file *filp)
 
 	vpu_dbg(LVL_BIT_FLOW, "<%d> ctx[%d] open\n",
 			current->pid, ctx->str_index);
-	vpu_calculate_performance(ctx, 0xff, true);
+	vpu_calculate_performance(ctx, 0xff, "open device");
 
 	return 0;
 
 err_firmware_load:
 	destroy_log_info_queue(ctx);
+	cleanup_perf_queue(ctx);
 	release_queue_data(ctx);
 
 	if (vpu_frmcrcdump_ena)
@@ -5946,6 +6016,9 @@ err_create_tsm:
 	kfifo_free(&ctx->msg_fifo);
 	dev->ctx[idx] = NULL;
 err_alloc_fifo:
+	mutex_destroy(&ctx->instance_mutex);
+	mutex_destroy(&ctx->cmd_lock);
+	mutex_destroy(&ctx->perf_lock);
 	destroy_workqueue(ctx->instance_wq);
 err_alloc_wq:
 	ctrls_delete_decoder(ctx);
diff --git a/drivers/mxc/vpu_malone/vpu_b0.h b/drivers/mxc/vpu_malone/vpu_b0.h
index 66d24f760eb0..11801d0a97a1 100644
--- a/drivers/mxc/vpu_malone/vpu_b0.h
+++ b/drivers/mxc/vpu_malone/vpu_b0.h
@@ -56,6 +56,7 @@ extern unsigned int vpu_dbg_level_decoder;
 #define MAX_MBI_NUM 18 // same with MEDIA_PLAYER_MAX_MBI_UNIT defined in firmware
 #define MAX_TIMEOUT_COUNT 10
 #define VPU_REG_BASE 0x40000000
+#define VPU_MAX_STEP_STRING_LENGTH 40
 
 #define V4L2_PIX_FMT_NV12_10BIT    v4l2_fourcc('N', 'T', '1', '2') /*  Y/CbCr 4:2:0 for 10bit  */
 #define INVALID_FRAME_DEPTH -1
@@ -321,13 +322,6 @@ struct vpu_dec_cmd_request {
 
 struct vpu_dec_perf_time {
 	u_int64 open_time;
-	u_int64 last_time;
-	u_int64 first_feed_interv;
-	u_int64 start_cmd_interv;
-	u_int64 start_done_interv;
-	u_int64 seq_hdr_found_interv;
-	u_int64 first_decoded_interv;
-	u_int64 first_ready_interv;
 
 	u_int64 first_decoded_time;
 	u_int64 last_decoded_time;
@@ -340,6 +334,12 @@ struct vpu_dec_perf_time {
 	u_int64 ready_fps;
 };
 
+struct vpu_dec_perf_queue {
+	struct list_head list;
+	char str[VPU_MAX_STEP_STRING_LENGTH];
+	u_int64 time;
+};
+
 struct vpu_ctx {
 	struct vpu_dev *dev;
 	struct v4l2_fh fh;
@@ -432,6 +432,9 @@ struct vpu_ctx {
 	int res_change_occu_count;
 	int res_change_send_count;
 	int res_change_done_count;
+
+	struct list_head perf_q;
+	struct mutex perf_lock;
 };
 
 #define LVL_WARN		(1 << 0)
-- 
2.17.1