#' Find peaks of waves in raw PULSE data
#' @description
#' `heartbeatr-package` Find peaks of waves in raw PULSE data
#'
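#' @param t time stamps of the data points (same length as `y`); currently not used by the peak detection itself
#' @param y numeric vector with the signal values in which peaks are searched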
#'
#' @return
#' A numeric vector indicating the indexes where peaks were detected.
#'
#' @export
find_peaks <- function(t, y) {
	# Multi-scale search for local maxima in `y`. `t` is accepted to keep the
	# interface stable but is not used; only the signal values drive the
	# detection. Returns the integer indexes of `y` where peaks were detected.
	N <- length(y)

	# with fewer than 2 points there is no neighbor to compare against (and
	# the index arithmetic below would fail), so there are no peaks to report
	if (N < 2) {
		return(integer(0))
	}

	L <- N %/% 2  # half-length of the array (number of scales evaluated)

	# de-trend data
	# S <- seq(1:N)
	# f <- fitted(lm(y ~ S))
	# y <- y - f

	# Boolean matrix with L rows (scales) and N columns (data points),
	# initialized with TRUE
	MAT <- matrix(TRUE, nrow = L, ncol = N)

	# at scale k, a point stays TRUE only if it is larger than the points
	# k positions to its right and to its left (where those exist)
	for (k in seq_len(L)) {
		head_idx <- 1:(N - k)
		tail_idx <- (k + 1):N
		MAT[k, head_idx] <- MAT[k, head_idx] & (y[head_idx] > y[tail_idx])
		MAT[k, tail_idx] <- MAT[k, tail_idx] & (y[tail_idx] > y[head_idx])
	}

	# find scale with most maxima
	G <- rowSums(MAT)
	G <- G * (L:1) # normalize to adjust for new edge regions
	l_scale <- which.max(G)

	# find peaks that persist on all scales up to l_scale
	MAT <- MAT[1:l_scale, , drop = FALSE]
	pks_logical <- apply(MAT, 2, all)

	# sometimes the algorithm picks up peaks at the beginning or end of the
	# split window, and those must be rejected (only done when there are
	# enough points to spare 10 at each end)
	if (N > 20) {
		l <- 10
		pks_logical[1:l] <- FALSE
		pks_logical[(N - l + 1):N] <- FALSE
	}

	which(pks_logical)
}

#' Determine the heart beat frequency in one PULSE channel
#'
#' @description
#' Take data from one PULSE channel and identify the heartbeat wave peaks using an algorithm that searches for maxima across multiple scales.
#'
#' @param split_window_one_channel a tibble with PULSE data for only one channel with columns `$time` and `$val`
#'
#' @return
#' A one-row tibble with 6 columns:
#' * `time`,  time at the center of split_window_one_channel$time
#' * `t_pks`, time stamps of each wave peak identified
#' * `hz`,    heartbeat rate estimate (in Hz)
#' * `n`,     number of wave peaks identified
#' * `sd`,    standard deviation of the intervals between wave peaks
#' * `ci`,    confidence interval (hz ± ci)
#'
#' @details
#' function builds upon code from https://github.com/ig248/pyampd
#'
#' @section Standard Deviation:
#' The `sd` computed refers to the spread of the intervals between each peak identified. It is a measure of the quality of the raw data and the ability of the algorithm to identify a real heart beat. The lower the `sd`, the more regular are the intervals between peaks, and the more likely that the algorithm did find a real signal. Conversely, higher `sd`s indicate that the peaks are found at irregular intervals, and is an indication of poor quality data.
# Because the frequency of the heartbeat in the data influences the magnitude of the `sd`, the absolute values for the intervals between each peak are first normalized (divided by the mean of those intervals, thus becoming a proportion). This ensures that `sd` values from different split windows can be directly compared, but should caution before reading too much into those values.
#' In detail, `sd` is computed by: 1) taking the timestamps for each peak identified \[`t_pks`\], 2) computing the intervals between each pair of consecutive peaks \[`as.numeric(diff(t_pks))`\], and 3) computing `sd` \[`sd(intervals)`\].
#'
#' @section BPM:
#' To convert to Beats Per Minute, simply multiply `hz` and `ci` by 60.
#'
#' @export
#'
#' @seealso
#'  * [pulse_find_peaks_all_channels()] runs [pulse_find_peaks_one_channel()] on all PULSE channels
#'  * [pulse_read()], [pulse_split()], [pulse_optimize()], [pulse_heart()], [pulse_doublecheck()] and [pulse_choose_keep()] are the functions needed for the complete PULSE processing workflow
#'  * [PULSE()] is a wrapper function that executes all the steps needed to process PULSE data at once
#'
#' @examples
#' ## Begin prepare data ----
#' pulse_data_sub <- pulse_data
#' pulse_data_sub$data <- pulse_data_sub$data[,1:5]
#' pulse_data_split <- pulse_split(pulse_data_sub)
#' pulse_data_split <- pulse_optimize(pulse_data_split, multi = pulse_data$multi)
#' split_window <- pulse_data_split$data[[1]]
#' split_window_one_channel <- split_window[,1:2]
#' colnames(split_window_one_channel) <- c("time", "val")
#' # End prepare data ----
#'
#' ## Determine heartbeat rates in one channel in one time window
#' pulse_find_peaks_one_channel(split_window_one_channel)
pulse_find_peaks_one_channel <- function(split_window_one_channel) {
	# Summarise the heartbeat signal of one channel: locate the wave peaks and
	# derive frequency and spread statistics from the intervals between them.
	# Returns a one-row tibble with columns time, t_pks, hz, n, sd and ci.

	t <- split_window_one_channel$time
	y <- split_window_one_channel$val

	# peaks already flagged in a `peak` column are reused instead of detected
	has_peak <- any(colnames(split_window_one_channel) == "peak")

	if (has_peak) {
		pks <- which(split_window_one_channel$peak)
	} else {
		pks <- find_peaks(t, y)
		# fewer than 3 peaks: retry after passing the data through
		# pulse_smooth1()
		if (length(pks) < 3) {
			split_window_one_channel <- pulse_smooth1(split_window_one_channel)
			t <- split_window_one_channel$time
			y <- split_window_one_channel$val
			pks <- find_peaks(t, y)
		}
	}

	# compute stats
	t_pks     <- t[pks]
	intervals <- diff(t_pks)
	# diff() on date-times yields a difftime whose units are picked
	# automatically (secs, mins, ...); force seconds so that `hz` really is
	# expressed in Hz regardless of how far apart the peaks are
	if (inherits(intervals, "difftime")) {
		units(intervals) <- "secs"
	}
	intervals <- as.numeric(intervals)
	hz        <- mean(1 / intervals)
	hz_sd     <- stats::sd(intervals) # spread of the intervals between peaks
	hz_ci     <- hz_sd * 1.96

	# return
	tibble::tibble(
		time   = mean(t),
		t_pks  = list(t_pks),
		hz     = round(hz,     3),
		n      = length(pks),
		sd     = round(hz_sd,  3),
		ci     = round(hz_ci,  3)
	)
}

#' Determine the heartbeat rate in all channels of a PULSE split window
#'
#' @description
#' Take data from PULSE data window and run `pulse_find_peaks_one_channel` in all channels.
#'
#' @inheritParams pulse_interpolate
#'
#' @return
#' A tibble with up to 10 rows (one for each channel) and 7 columns:
#' * `id`, PULSE channel IDs
#' * `time`, time at the center of split_window_one_channel$time
#' * `data`, a list of tibbles with raw PULSE data for each combination of channel and window, with columns `time`, `val` and `peak` (`TRUE` when data points correspond to wave peaks)
#' * `hz`,  heartbeat rate estimate (in Hz)
#' * `n`,   number of wave peaks identified
#' * `sd`,  standard deviation of the intervals between wave peaks
#' * `ci`,  confidence interval (hz ± ci)
#'
#' @export
#'
#' @seealso
#'  * [pulse_find_peaks_all_channels()] runs [pulse_find_peaks_one_channel()] on all PULSE channels
#'  * [pulse_read()], [pulse_split()], [pulse_optimize()], [pulse_heart()] and [pulse_choose_keep()] are the functions needed for the complete PULSE processing workflow
#'  * [PULSE()] is a wrapper function that executes all the steps needed to process PULSE data at once
#'
#' @section BPM:
#' To convert to Beats Per Minute, simply multiply `hz` and `ci` by 60.
#'
#' @examples
#' ## Begin prepare data ----
#' pulse_data_sub <- pulse_data
#' pulse_data_sub$data <- pulse_data_sub$data[,1:5]
#' pulse_data_split <- pulse_split(pulse_data_sub)
#' pulse_data_split <- pulse_optimize(pulse_data_split, multi = pulse_data$multi)
#' split_window <- pulse_data_split$data[[1]]
#' ## End prepare data ----
#'
#' # Determine heartbeat rates in all channels in one time window
#' pulse_find_peaks_all_channels(split_window)
pulse_find_peaks_all_channels <- function(split_window) {
	# Run pulse_find_peaks_one_channel() on every channel of one split window
	# and return one row per channel, with the peaks flagged in `data`.
	stopifnot(is.pulse.tbl(split_window))

	# one row per channel, with that channel's (time, val) series nested in `data`
	channels <- tidyr::nest(
		tidyr::pivot_longer(
			split_window,
			cols      = -time,
			names_to  = "id",
			values_to = "val"
		),
		data = c("time", "val")
	)

	# heart rate stats for all available channels, stacked into one tibble
	stats_per_channel <- purrr::list_rbind(
		purrr::map(channels$data, pulse_find_peaks_one_channel)
	)
	channels <- dplyr::bind_cols(channels, stats_per_channel)

	# flag the rows of a channel's data whose time stamp matches a peak
	mark_peaks <- function(dat, peak_times) {
		dplyr::mutate(dat, peak = time %in% peak_times)
	}

	channels <- dplyr::mutate(
		channels,
		data = purrr::map2(data, t_pks, mark_peaks)
	)
	# the peak time stamps now live inside `data`, so the helper column can go
	channels <- dplyr::select(channels, -t_pks)

	dplyr::relocate(channels, id, time)
}


#' (`STEP 4`) Determine the heartbeat rate in all channels of a split PULSE object
#'
#' @description
#' * `step 1` -- [pulse_read()]
#' * `step 2` -- [pulse_split()]
#' * `step 3` -- [pulse_optimize()]
#' * **`-->>` step 4 -- [pulse_heart()] `<<--`**
#' * `step 5` -- [pulse_doublecheck()]
#' * `step 6` -- [pulse_choose_keep()]
#'
#' For each combination of channel and time window, determine the heartbeat rate automatically.
#'
#' `pulse_heart()` takes the output from a call to `pulse_optimize()` (or `pulse_split()` if optimization is skipped, but that is highly discouraged) and employs an algorithm optimized for the identification of wave peaks in noisy data to determine the heart beat frequency in all channels of the PULSE dataset.
#'
#' @inheritParams pulse_read
#' @inheritParams pulse_split
#' @inheritParams pulse_optimize
#' @param show_progress logical, defaults to `FALSE`. If set to `TRUE`, progress messages will be provided.
#'
#' @return
#' A tibble with nrows = (number of channels) * (number of windows in `pulse_data_split`) and 10 columns:
#' * `i`, index of each time window's order
#' * `smoothed`, whether the data has been smoothed with [pulse_smooth()]
#' * `id`, PULSE channel IDs
#' * `time`, time at the center of each time window
#' * `data`, a list of tibbles with raw PULSE data for each combination of channel and window, with columns `time`, `val` and `peak` (`TRUE` in rows corresponding to wave peaks)
#' * `hz`,  heartbeat rate estimate (in Hz)
#' * `n`,   number of wave peaks identified
#' * `sd`,  standard deviation of the intervals between wave peaks
#' * `ci`,  confidence interval (hz ± ci)
#' * `keep`, whether `n` and `sd` are within the target thresholds
#'
#' @section BPM:
#' To convert to Beats Per Minute, simply multiply `hz` and `ci` by 60.
#'
#' @export
#'
#' @seealso
#'  * [pulse_find_peaks_all_channels()] runs [pulse_find_peaks_one_channel()] on all PULSE channels
#'  * [pulse_read()], [pulse_split()], [pulse_optimize()], [pulse_doublecheck()] and [pulse_choose_keep()] are the other functions needed for the complete PULSE processing workflow
#'  * [PULSE()] is a wrapper function that executes all the steps needed to process PULSE data at once
#'  * [pulse_summarise()] can be used to reduce the number of data points returned
#'
#' @examples
#' ## Begin prepare data ----
#' pulse_data_sub <- pulse_data
#' pulse_data_sub$data <- pulse_data_sub$data[,1:3]
#' pulse_data_split <- pulse_split(pulse_data_sub)
#' pulse_data_split <- pulse_optimize(pulse_data_split, multi = pulse_data$multi)
#' ## End prepare data ----
#'
#' # Determine heartbeat rates in all channels in all time windows
#' pulse_heart(pulse_data_split)
pulse_heart <- function(pulse_data_split, msg = TRUE, show_progress = FALSE) {
	# Apply pulse_find_peaks_all_channels() to every time window and return
	# the results as one long tibble. `msg` is validated below but is not
	# otherwise used within this function.

	## CHECKS INITIATED ## ------------------- ##
	stopifnot(all(purrr::map_lgl(pulse_data_split$data, is.pulse.tbl)))
	stopifnot(is.logical(msg))
	## CHECKS COMPLETED ## ------------------- ##

	# progress bar settings handed to purrr (disabled unless requested)
	progress <- FALSE
	if (show_progress) {
		progress <- list(clear = TRUE, format = "computing heart rates {cli::pb_bar} {cli::pb_percent} [{cli::pb_elapsed}] ETA: {cli::pb_eta}")
	}

	# one tibble of per-channel results for each time window
	per_window <- purrr::map(
		pulse_data_split$data,
		pulse_find_peaks_all_channels,
		.progress = progress
	)

	# tidy: swap each window's `data` for its results and expand them so that
	# every channel gets its own row
	windows <- dplyr::select(pulse_data_split, -data)
	windows <- tibble::add_column(windows, heart = per_window)

	tidyr::unnest(windows, cols = "heart")
}

#' (`STEP 5`) Fix heart rate frequencies double the real value
#'
#' @description
#' * `step 1` -- [pulse_read()]
#' * `step 2` -- [pulse_split()]
#' * `step 3` -- [pulse_optimize()]
#' * `step 4` -- [pulse_heart()]
#' * **`-->>` step 5 -- [pulse_doublecheck()] `<<--`**
#' * `step 6` -- [pulse_choose_keep()]
#'
#' @description
#' Flag (and correct) data points where it is likely that the heart rate frequency computed corresponds to double the actual heart rate frequency due to the algorithm having identified two peaks per heart beat
#'
#' @param heart_rates the output from [pulse_heart()]
#' @param flag numerical, decimal from 0 to 1, defaults to `0.9`; values of `d_r` above this number will be flagged as instances where the algorithm resulted in double the real heart rate. Values above `1` are meaningless (zero data points will be flagged), and values below `~0.66` are too lax (many data points will be flagged when they shouldn't).
#' @param correct logical, defaults to `TRUE`; if `FALSE`, data points with `hz` values likely double the real value are flagged **BUT NOT CORRECTED**. If `TRUE`, `hz` (as well as `data`, `n`, `sd` and `ci`) are corrected accordingly. Note that the correction is not reversible!
#'
#' @return A tibble similar to the one used as input, now augmented with two new columns: `d_r` and `d_f`. Values of `d_r` (ratio) close to `1` are indicative that the value for `hz` determined by the algorithm should be halved. If `correct` was set to `TRUE`, `d_f` flags data points where `hz` **HAS BEEN HALVED**. If `correct` was set to `FALSE`, then `d_f` flags data points where `hz` **SHOULD BE HALVED**.
#'
#' @section Heart beat frequency estimation:
#' For many invertebrates, the circulatory system includes more than one contractile chamber, meaning that there are two consecutive movements that may or may not be detected by the PULSE system's IR sensors. Furthermore, when the sensor is attached to the shell of the animal, it remains at a fixed position even as the soft body tissues move below that. As a result, even if one takes explicit care to position the sensor in such a way that only one wave peak is detected for each heart beat cycle, at some point the animal may move and the sensor's field of view may come to encompass both contractile chambers. When that occurs, the shape of the signal detected will include two peaks per heart beat cycle, the relative sizes of which may vary considerably. To be clear, there's nothing wrong with such a signal. However, it creates a problem: the algorithm detects peaks, and therefore, if two peaks are detected for each heart beat, the resulting estimate for the heart beat frequency will show a value twice as much as the real value.
#'
#' @section Detection method:
#' While it is often easy to discern if a PULSE data point has two peaks per heart beat upon visual inspection, to do so automatically is much harder. The strategy employed here relies on analyzing the intervals between consecutive peaks and looking for a regular alternation between longer and shorter intervals, as well as higher and lower peak signal values. If intervals are consistently shorter, then longer, then shorter again, we can assume that the distribution of interval times is bimodal, and that there are always two peaks closer together separated by a longer interval - a classical two-peaks-per-heart-beat situation. For example, let's say 24 peaks are detected. We can compute the time span between each peak, which will correspond to 23 intervals (measured in seconds). Then, intervals can be classified as being longer or shorter than the preceding interval. Lastly, we divide the number of longer-than-previous intervals by the total number of intervals, deriving the ratio of switching intervals. Similarly, if peak signal values are consistently higher, then lower, then higher again, we can also assume that two different heart movements belonging to the same heartbeat are represented in the data, and a similar algorithm can be followed. The closer the ratio is to `1`, the more certain we are that we are facing a situation where the algorithm will result in a heart beat frequency twice the real value. Because the choice of a threshold to flag data points as needing to be halved or not is arbitrary, both the flagging and the ratio are provided in the output, thus enabling a reassessment of the resulting classification.
#'
#' @export
#'
#' @seealso
#'  * [pulse_heart()] generates the tibble that is used as input.
#'  * [pulse_read()], [pulse_split()], [pulse_optimize()], [pulse_heart()] and [pulse_choose_keep()] are the other functions needed for the complete PULSE processing workflow
#'  * [PULSE()] is a wrapper function that executes all the steps needed to process PULSE data at once, including the identification of possible heart rate doublings
#'
#' @examples
#' ## Begin prepare data ----
#' pulse_data_sub <- pulse_data
#' pulse_data_sub$data <- pulse_data_sub$data[,1:3]
#' pulse_data_split <- pulse_split(pulse_data_sub)
#' pulse_data_split <- pulse_optimize(pulse_data_split, multi = pulse_data$multi)
#' heart_rates <- pulse_heart(pulse_data_split)
#' ## End prepare data ----
#'
#' # Correct heartbeat frequency estimates
#' pulse_doublecheck(heart_rates)
pulse_doublecheck <- function(heart_rates, flag = 0.9, correct = TRUE) {
	# Flag (and optionally halve) estimates where peaks alternate cyclically
	# in timing or amplitude. Adds columns `d_r` (the larger of the time and
	# amplitude alternation ratios) and `d_f` (`d_r > flag`).
	hr_data <- heart_rates$data

	# Ratio of cyclic alternation in a series of per-peak values (time stamps
	# or signal values). `min_positives` is the number of positive changes
	# required before alternation is assessed; below that the ratio is 0.
	alternation_ratio <- function(peak_values, min_positives) {
		# change between consecutive peaks
		steps <- as.numeric(diff(peak_values))

		# indexes of positive changes in that difference (i.e., when the
		# difference is widening rather than contracting)
		positives <- which(diff(steps, lag = 1) > 0)

		alternates <- if (length(positives) < min_positives) {
			FALSE
		} else {
			# a gap of exactly 2 between positive changes indicates cyclical
			# alternation between pos and neg
			as.numeric(diff(positives)) == 2
		}

		# ratio of gaps alternating cyclically
		sum(alternates) / length(alternates)
	}

	# rows corresponding to detected peaks, for each channel/window
	peaks <- purrr::map(hr_data, ~dplyr::filter(.x, peak))

	# TIME
	# alternation in the time spans between consecutive peaks
	t_ratios <- purrr::map_dbl(
		peaks,
		~alternation_ratio(dplyr::pull(.x, time), min_positives = 3)
	)

	# AMPLITUDE
	# alternation in the signal change between consecutive peaks
	a_ratios <- purrr::map_dbl(
		peaks,
		~alternation_ratio(dplyr::pull(.x, val), min_positives = 5)
	)

	# tidy
	hr <- heart_rates %>%
		dplyr::mutate(
			d_r = pmax(t_ratios, a_ratios),
			d_f = d_r > flag
		)

	# apply correction (scalar condition, hence `&&`)
	if (correct && any(hr$d_f)) {
		ii <- which(hr$d_f)
		hr_true <- hr %>%
			dplyr::slice(ii) %>%
			pulse_halve()
		# put the halved rows back and restore a consistent row order
		hr <- hr %>%
			dplyr::slice(-ii) %>%
			dplyr::bind_rows(hr_true) %>%
			dplyr::arrange(i, time, id)
	}

	# return
	hr
}

#' Halves heart beat frequencies computed by `pulse_heart`
#'
#' @description
#' Halves the heart beat frequency computed by `pulse_heart` when double peaks have been detected by `pulse_doublecheck`. Note that the correction cannot be reverted (if just testing, store as a different variable). The associated stats are recalculated. This function is used by `pulse_doublecheck`, it is not immediately usable as standalone.
#'
#' @param hr a tibble as the one used as input to [pulse_doublecheck()], but with the additional column `d_f`, which flags rows where heart beat frequencies need to be halved. All rows supplied are halved, so input should be a filtered version of the full dataset.
#'
#' @return A tibble with as many rows as the one provided as input, but with `data`, `hz`, `n`, `sd`, and `ci` adjusted accordingly.
#'
#' @export
#'
#' @seealso
#'  * [pulse_doublecheck()] is the function within the [`heartbeatr-package`] that uses `pulse_halve`
#'  * [PULSE()] is a wrapper function that executes all the steps needed to process PULSE data at once, including the identification and correction of possible heart rate doublings
pulse_halve <- function(hr) {
	# Keep every other flagged peak in each row's `data` and recompute the
	# associated stats (`hz`, `n`, `sd`, `ci`). Every row supplied is halved.
	stopifnot(tibble::is_tibble(hr))
	stopifnot(any(colnames(hr) == "d_f"))

	# nothing to halve: return the input untouched rather than letting the
	# column assignments below work from an empty result
	if (nrow(hr) == 0) {
		return(hr)
	}

	# halve the peaks of a single channel/window and recompute its stats
	halve_one <- function(x) {
		old_peaks <- which(x$peak)
		if (length(old_peaks) > 0) {
			# keep the 1st, 3rd, 5th, ... peak and un-flag the others
			new_peaks <- old_peaks[seq(1, length(old_peaks), by = 2)]
			x$peak <- FALSE
			x$peak[new_peaks] <- TRUE
		}
		# `x` carries a `peak` column, so the stats are derived from the
		# peaks flagged above rather than from a new peak search
		new_dat <- pulse_find_peaks_one_channel(x)
		new_dat$data <- list(x)
		new_dat
	}

	new_hr <- purrr::list_rbind(purrr::map(hr$data, halve_one))

	hr$data <- new_hr$data
	hr$hz   <- new_hr$hz
	hr$n    <- new_hr$n
	hr$sd   <- new_hr$sd
	hr$ci   <- new_hr$ci
	# NOTE(review): the flag is cleared here, whereas the docs of
	# pulse_doublecheck() describe `d_f` as marking rows that have been
	# halved when `correct = TRUE` -- confirm which one is intended
	hr$d_f  <- FALSE

	hr
}

#' (`STEP 6`) Choose the best heart beat frequency estimate from among two estimates derived from raw and smoothed data
#'
#' @description
#' * `step 1` -- [pulse_read()]
#' * `step 2` -- [pulse_split()]
#' * `step 3` -- [pulse_optimize()]
#' * `step 4` -- [pulse_heart()]
#' * `step 5` -- [pulse_doublecheck()]
#' * **`-->>` step 6 -- [pulse_choose_keep()] `<<--`**
#'
#' @description
#' When running [pulse_optimize()] or [PULSE()] with `raw_v_smoothed = TRUE`, two estimates are generated for each data point, and `pulse_choose_keep` is used to automatically select the best one (based on N and SD levels set by the user). NOTE: if supplied with input data generated using `raw_v_smoothed = FALSE`, `pulse_choose_keep` outputs the same data, unchanged.
#'
#' @inheritParams pulse_doublecheck
#' @param lim_n numeric, defaults to `3`; a time window is only considered a "keep" if the number of peaks detected in it is greater than `lim_n`.
#' @param lim_sd numeric, defaults to `0.75`; a time window is only considered a "keep" if the sd of the time intervals between the peaks detected is lower than `lim_sd`.
#'
#' @return
#' A tibble with the same structure as the input, but now with only one estimate for each combination of `id` and `time` (the one that was deemed better).
#'
#' @export
#'
#' @seealso
#'  *  [pulse_read()], [pulse_split()], [pulse_optimize()], [pulse_heart()] and [pulse_doublecheck()] are the other functions needed for the complete PULSE processing workflow
#'  * [PULSE()] is a wrapper function that executes all the steps needed to process PULSE data at once
#'
#' @examples
#' ## Begin prepare data ----
#' pulse_data_sub <- pulse_data
#' pulse_data_sub$data <- pulse_data_sub$data[,1:2]
#' pulse_data_split <- pulse_split(pulse_data_sub)
#' pulse_data_split <- pulse_optimize(pulse_data_split, multi = pulse_data$multi)
#' heart_rates <- pulse_heart(pulse_data_split)
#' ## End prepare data ----
#'
#' nrow(heart_rates)
#' heart_rates <- pulse_choose_keep(heart_rates)
#' nrow(heart_rates) # halved
pulse_choose_keep <- function(heart_rates, lim_n = 3, lim_sd = 0.75) {
	# Retain a single estimate per combination of `i`, `id` and `time`, then
	# add a `keep` column telling whether it meets the `n` and `sd` thresholds.

	# rank the candidates: lowest sd first (a missing sd is replaced by 100
	# for ranking purposes), ties resolved by `smoothed`
	ranked <- tibble::rowid_to_column(heart_rates, "row")
	ranked <- dplyr::mutate(ranked, SD = ifelse(is.na(sd), 100, sd))
	ranked <- dplyr::arrange(ranked, i, id, time, SD, smoothed)

	# original row number of the top-ranked candidate within each group
	best_rows <- ranked %>%
		dplyr::group_by(i, id, time) %>%
		dplyr::summarise(row = row[1], .groups = "drop") %>%
		dplyr::pull(row)

	# keep the best estimate
	best <- dplyr::slice(heart_rates, best_rows)

	# keep (check if n and sd meet the provided thresholds); comparisons that
	# cannot be evaluated (NA) count as not meeting them
	dplyr::mutate(
		best,
		keep = (n > lim_n) & (sd < lim_sd),
		keep = !is.na(keep) & keep
	)
}
