--- title: "ksformat Usage Examples" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{ksformat Usage Examples} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) library(ksformat) ``` ksformat logo The **ksformat** package provides SAS PROC FORMAT-like functionality for R. This vignette walks through the most common use cases. ## Example 1: Basic Discrete Formatting Create a format for gender codes (auto-stored in library as "sex"): ```{r discrete} fnew( "M" = "Male", "F" = "Female", .missing = "Unknown", .other = "Other Gender", name = "sex" ) gender_codes <- c("M", "F", "M", NA, "X", "F") formatted_genders <- fput(gender_codes, "sex") data.frame( code = gender_codes, label = formatted_genders ) fprint("sex") ``` ## Example 2: Numeric Range Formatting Define formats in SAS-like text (auto-registered): ```{r ranges} fparse(text = ' VALUE age (numeric) [0, 18) = "Child" [18, 65) = "Adult" [65, HIGH] = "Senior" .missing = "Age Unknown" ; ') ages <- c(5, 15.3, 17.9, 18, 45, 64.99, 65, 85, NA) age_groups <- fputn(ages, "age") data.frame( age = ages, group = age_groups ) ``` ## Example 3: Decimal Ranges (BMI Categories) ```{r bmi} fparse(text = ' VALUE bmi (numeric) [0, 18.5) = "Underweight" [18.5, 25) = "Normal" [25, 30) = "Overweight" [30, HIGH] = "Obese" .missing = "No data" ; ') bmi_values <- c(16.2, 18.5, 22.7, 25, 29.9, 35.1, NA) bmi_labels <- fputn(bmi_values, "bmi") data.frame( bmi = bmi_values, category = bmi_labels ) ``` ## Example 4: Exclusive/Inclusive Bounds ```{r bounds} fparse(text = ' VALUE score (numeric) (0, 50] = "Low" (50, 100] = "High" .other = "Out of range" ; ') scores <- c(0, 1, 50, 51, 100, 101) score_labels <- fputn(scores, "score") data.frame( score = scores, label = score_labels ) ``` ## Example 5: Reverse Formatting with Invalue Invalues convert labels back to values. The default `target_type` is `"numeric"`: ```{r invalue} finput( "Male" = 1, "Female" = 2, name = "sex_inv" ) labels <- c("Male", "Female", "Male", "Unknown", "Female") codes <- finputn(labels, "sex_inv") data.frame( label = labels, code = codes ) ``` ## Example 6: Bidirectional Formatting `fnew_bid()` creates both a format and an invalue at once: ```{r bidirectional} status_bi <- fnew_bid( "A" = "Active", "I" = "Inactive", "P" = "Pending", name = "status" ) # Forward: code -> label status_codes <- c("A", "I", "P", "A") status_labels <- fputc(status_codes, "status") data.frame(code = status_codes, label = status_labels) # Reverse: label -> code test_labels <- c("Active", "Pending", "Inactive") test_codes <- finputc(test_labels, "status_inv") data.frame(label = test_labels, code = test_codes) ``` ## Example 7: Parse Multiple Formats from Text ```{r multiparse} fparse(text = ' // Study format definitions VALUE race (character) "W" = "White" "B" = "Black" "A" = "Asian" .missing = "Unknown" ; INVALUE race_inv "White" = 1 "Black" = 2 "Asian" = 3 ; ') flist() # character vector of names fprint() ``` ## Example 8: Export Formats Back to Text ```{r export} bmi_fmt <- format_get("bmi") cat(fexport(bmi = bmi_fmt)) ``` ## Example 9: SAS-like PUT/INPUT Functions ```{r sas-put-input} # fputn — apply numeric format by name fputn(c(5, 30, 70), "age") # fputc — apply character format by name fputc(c("M", "F"), "sex") # finputn — apply numeric invalue by name finputn(c("White", "Black"), "race_inv") ``` ## Example 10: Data Frame Formatting ```{r df-format} df <- data.frame( id = 1:6, sex = c("M", "F", "M", "F", NA, "X"), age = c(15, 25, 45, 70, 35, NA), stringsAsFactors = FALSE ) sex_f <- format_get("sex") age_f <- format_get("age") df_formatted <- fput_df( df, sex = sex_f, age = age_f, suffix = "_label" ) df_formatted ``` ## Example 11: Missing Value Handling ```{r missing} # With .missing label fput(c("M", "F", NA), "sex") # With keep_na = TRUE fput(c("M", "F", NA), sex_f, keep_na = TRUE) # is_missing() checks is_missing(NA) is_missing(NaN) is_missing("") # TRUE — empty strings are treated as missing ``` ## Example 12: Date/Time Formats (SAS-style) ### SAS Date Formats SAS date format names are auto-resolved — no pre-creation needed: ```{r date-formats} today <- Sys.Date() data.frame( format = c("DATE9.", "MMDDYY10.", "DDMMYY10.", "YYMMDD10.", "MONYY7.", "WORDDATE.", "YEAR4.", "QTR."), result = c( fputn(today, "DATE9."), fputn(today, "MMDDYY10."), fputn(today, "DDMMYY10."), fputn(today, "YYMMDD10."), fputn(today, "MONYY7."), fputn(today, "WORDDATE."), fputn(today, "YEAR4."), fputn(today, "QTR.") ) ) # Multiple dates dates <- as.Date(c("2020-01-15", "2020-06-30", "2020-12-25")) fputn(dates, "DATE9.") ``` ### R Numeric Dates (Days Since 1970-01-01) ```{r date-numeric} r_days <- as.numeric(as.Date("2025-01-01")) r_days fputn(r_days, "DATE9.") fputn(r_days, "MMDDYY10.") ``` ### Time Formats Time is represented as seconds since midnight: ```{r time-formats} seconds <- c(0, 3600, 45000, 86399) data.frame( seconds = seconds, TIME8 = fputn(seconds, "TIME8."), TIME5 = fputn(seconds, "TIME5."), HHMM = fputn(seconds, "HHMM.") ) ``` ### Datetime Formats ```{r datetime-formats} now <- Sys.time() data.frame( format = c("DATETIME20.", "DATETIME13.", "DTDATE.", "DTYYMMDD."), result = c( fputn(now, "DATETIME20."), fputn(now, "DATETIME13."), fputn(now, "DTDATE."), fputn(now, "DTYYMMDD.") ) ) # From numeric R-epoch seconds r_secs <- as.numeric(as.POSIXct("2025-06-15 14:30:00", tz = "UTC")) fputn(r_secs, "DATETIME20.") ``` ### Custom Date Formats with `fnew_date()` ```{r fnew-date} # SAS-named format fnew_date("DATE9.", name = "bday_fmt") birthdays <- as.Date(c("1990-03-25", "1985-11-03", "2000-07-14")) fput(birthdays, "bday_fmt") # Custom strftime pattern (e.g. DD.MM.YYYY) fnew_date("%d.%m.%Y", name = "ru_date", type = "date") fput(birthdays, "ru_date") # Custom pattern with missing label fnew_date("MMDDYY10.", name = "us_date", .missing = "NO DATE") mixed <- c(as.Date("2025-01-01"), NA, as.Date("2025-12-31")) fput(mixed, "us_date") fprint("bday_fmt") ``` ### Date Formats in Data Frames ```{r date-df} patients <- data.frame( id = 1:4, visit_date = as.Date(c("2025-01-10", "2025-02-15", "2025-03-20", NA)), stringsAsFactors = FALSE ) visit_fmt <- fnew_date("DATE9.", name = "visit_fmt", .missing = "NOT RECORDED") fput_df(patients, visit_date = visit_fmt) ``` ### Parse Date Formats from Text ```{r date-parse} fparse(text = ' VALUE enrldt (date) pattern = "DATE9." .missing = "Not Enrolled" ; VALUE visit_time (time) pattern = "TIME8." ; VALUE stamp (datetime) pattern = "DATETIME20." ; ') fput(as.Date("2025-03-01"), "enrldt") fput(36000, "visit_time") fput(as.POSIXct("2025-03-01 10:00:00", tz = "UTC"), "stamp") # Export back to text enrl_obj <- format_get("enrldt") cat(fexport(enrldt = enrl_obj)) fclear() ``` ## Example 13: Multilabel Formats ### Overlapping Age Categories With multilabel formats, a single value can match multiple labels: ```{r multilabel-basic} fnew( "0,5,TRUE,TRUE" = "Infant", "6,11,TRUE,TRUE" = "Child", "12,17,TRUE,TRUE" = "Adolescent", "0,17,TRUE,TRUE" = "Pediatric", "18,64,TRUE,TRUE" = "Adult", "65,Inf,TRUE,TRUE" = "Elderly", "18,Inf,TRUE,TRUE" = "Non-Pediatric", name = "age_categories", type = "numeric", multilabel = TRUE ) ages <- c(3, 14, 25, 70) # fput returns first match only fput(ages, "age_categories") # fput_all returns ALL matching labels all_labels <- fput_all(ages, "age_categories") for (i in seq_along(ages)) { cat("Age", ages[i], "->", paste(all_labels[[i]], collapse = ", "), "\n") } ``` ### Multilabel with Missing Values ```{r multilabel-missing} fnew( "0,100,TRUE,TRUE" = "Valid Score", "0,49,TRUE,TRUE" = "Below Average", "50,100,TRUE,TRUE" = "Above Average", "90,100,TRUE,TRUE" = "Excellent", .missing = "No Score", .other = "Out of Range", name = "score_ml", type = "numeric", multilabel = TRUE ) scores <- c(95, 45, NA, 150) ml_result <- fput_all(scores, "score_ml") for (i in seq_along(scores)) { cat("Score", ifelse(is.na(scores[i]), "NA", scores[i]), "->", paste(ml_result[[i]], collapse = ", "), "\n") } ``` ### Parse Multilabel from Text ```{r multilabel-parse} fparse(text = ' VALUE risk (numeric, multilabel) [0, 3] = "Low Risk" [0, 7] = "Monitored" (3, 7] = "Medium Risk" (7, 10] = "High Risk" ; ') risk_scores <- c(2, 5, 9) risk_labels <- fput_all(risk_scores, "risk") for (i in seq_along(risk_scores)) { cat("Score", risk_scores[i], "->", paste(risk_labels[[i]], collapse = " | "), "\n") } ``` ### Multilabel Export ```{r multilabel-export} risk_obj <- format_get("risk") cat(fexport(risk = risk_obj)) fprint("risk") ``` ### Practical Example: Adverse Event Severity Grading ```{r ae-grading} fnew( "1,1,TRUE,TRUE" = "Mild", "2,2,TRUE,TRUE" = "Moderate", "3,3,TRUE,TRUE" = "Severe", "4,4,TRUE,TRUE" = "Life-threatening", "5,5,TRUE,TRUE" = "Fatal", "3,5,TRUE,TRUE" = "Serious", "1,2,TRUE,TRUE" = "Non-serious", name = "ae_grade", type = "numeric", multilabel = TRUE ) grades <- c(1, 2, 3, 4, 5) ae_labels <- fput_all(grades, "ae_grade") for (i in seq_along(grades)) { cat("Grade", grades[i], ":", paste(ae_labels[[i]], collapse = " + "), "\n") } fclear() ``` ## Example 14: Case-Insensitive Matching ```{r nocase} sex_nc <- fnew( "M" = "Male", "F" = "Female", .missing = "Unknown", name = "sex_nc", type = "character", ignore_case = TRUE ) input <- c("m", "F", "M", "f", NA) fput(input, sex_nc) # Note the [nocase] flag fprint("sex_nc") # Also works with fputc fputc("m", "sex_nc") fclear() ``` ## Example 15: Expression Labels in Formats Expression labels contain `.x1`, `.x2`, etc., which reference extra arguments passed to `fput()`. This lets you compute labels dynamically. ### Simple `sprintf` Expression ```{r expr-sprintf} stat_fmt <- fnew( "n" = "sprintf('%s', .x1)", "pct" = "sprintf('%.1f%%', .x1 * 100)", name = "stat", type = "character" ) types <- c("n", "pct", "n", "pct") values <- c(42, 0.053, 100, 0.255) fput(types, stat_fmt, values) ``` ### Two Extra Arguments (`.x1`, `.x2`) ```{r expr-twoargs} ratio_fmt <- fnew( "ratio" = "sprintf('%s/%s', .x1, .x2)", name = "ratio", type = "character" ) fput("ratio", ratio_fmt, 3, 10) fput(c("ratio", "ratio"), ratio_fmt, c(3, 7), c(10, 20)) ``` ### `ifelse` Expression ```{r expr-ifelse} sign_fmt <- fnew( "val" = "ifelse(.x1 > 0, paste0('+', .x1), as.character(.x1))", name = "sign", type = "character" ) nums <- c(5, 0, -3) fput(rep("val", 3), sign_fmt, nums) ``` ### Mixed Static and Expression Labels ```{r expr-mixed} mixed_fmt <- fnew( "header" = "HEADER", "n" = "sprintf('N=%s', .x1)", "pct" = "sprintf('%.1f%%', .x1 * 100)", name = "mixed", type = "character" ) keys <- c("header", "n", "pct", "header", "n") vals <- c(0, 42, 0.15, 0, 100) fput(keys, mixed_fmt, vals) ``` ### Expression in `.other` Fallback ```{r expr-other} known_fmt <- fnew( "ok" = "OK", .other = "sprintf('Error(%s)', .x1)", name = "err_fmt", type = "character" ) codes <- c("ok", "E01", "ok", "E99") details <- c("", "timeout", "", "overflow") fput(codes, known_fmt, details) ``` ### Scalar Recycling ```{r expr-recycle} label_fmt <- fnew( "val" = "sprintf('%s (N=%s)', .x1, .x2)", name = "recycle", type = "character" ) fput(c("val", "val"), label_fmt, c(42, 55), 100) ``` ### Statistical Table Format with Computed Labels A realistic clinical-trial example: `e()` marks labels as expressions evaluated at apply-time, `.x1` references the extra argument, and multiline `dplyr::case_when` shows complex conditional formatting. ```{r expr-stat-fnew} # Population counts used as denominators n.trt <- data.frame(pop = c("fas","pps","saf"), ntot = c(34, 30, 36)) get_n <- function(pop) { n.trt$ntot[n.trt$pop == pop] } fnew( "n_fas" = e("get_n('fas')"), "n_pps" = e("get_n('pps')"), "n_saf" = e("get_n('saf')"), "n" = "sprintf('%d', .x1)", "n_pct_fas" = "sprintf('%d (%5.1f%%)', .x1, .x1 * 100 / get_n('fas'))", "n_pct_pps" = "sprintf('%d (%5.1f%%)', .x1, .x1 * 100 / get_n('pps'))", "n_pct_saf" = "sprintf('%d (%5.1f%%)', .x1, .x1 * 100 / get_n('saf'))", "pct" = "dplyr::case_when( .x1>0 & .x1<0.1 ~ sprintf('%5s', ' <0.1%'), .x1>=0.1 | .x1==0 ~ sprintf(paste0('%5.', 1 ,'f%%'), .x1) )", "pval" = "dplyr::case_when( .x1>=0 & .x1<0.001 ~ sprintf('%s', '<0.001'), .x1>=0.001 & .x1<=0.999 ~ sprintf(paste0('%.', 3 ,'f'), .x1), .x1>0.999 ~ sprintf('%s', '>0.999'), .default = '--' )", name = "stat", type = "character" ) ``` The same format can be created via `fparse()`. Note that multiline expressions must be collapsed to single lines in the text block, and `(eval)` marks evaluated labels: ```{r expr-stat-fparse} fmt <- ' VALUE stat_01 (character) "n_fas" = "get_n(\'fas\')" (eval) "n_pps" = "get_n(\'pps\')" (eval) "n_saf" = "get_n(\'saf\')" (eval) "n" = "sprintf(\'%d\', .x1)" "pct" = "dplyr::case_when(.x1>0 & .x1<0.1 ~ sprintf(\'%5s\', \' <0.1%\'), .x1>=0.1 | .x1==0 ~ sprintf(paste0(\'%5.\', 1 ,\'f%%\'), .x1))" "n_pct_fas" = "sprintf(\'%d (%5.1f%%)\', .x1, .x1 * 100 / get_n(\'fas\'))" "n_pct_pps" = "sprintf(\'%d (%5.1f%%)\', .x1, .x1 * 100 / get_n(\'pps\'))" "n_pct_saf" = "sprintf(\'%d (%5.1f%%)\', .x1, .x1 * 100 / get_n(\'saf\'))" "pval" = "dplyr::case_when(.x1>=0 & .x1<0.001 ~ sprintf(\'%s\', \'<0.001\'), .x1>=0.001 & .x1<=0.999 ~ sprintf(paste0(\'%.\', 3 ,\'f\'), .x1), .x1>0.999 ~ sprintf(\'%s\', \'>0.999\'), .default = \'--\')" ;' fparse(fmt) ``` Both `stat` (via `fnew`) and `stat_01` (via `fparse`) produce identical results: ```{r expr-stat-apply} df <- data.frame( types = c("n_fas", "n_pps", "n_saf", "n", "pct", "pct", "n", "pval", "pval", "n_pct_fas", "n_pct_pps", "n_pct_saf"), values = c(NA, NA, NA, 42, 0.053, 0.0008, 100, 0.255, 0.0003, 22, 22, 22) ) df$fmt <- fput(df$types, "stat", df$values) df$fmt_01 <- fput(df$types, "stat_01", df$values) print(df) fclear() ``` ## Example 16: Vectorized Format Names (SAS PUTC-style) Each element can use a different format, determined by a vector of format names: ```{r vectorized} # Dispatch format: maps type code to format name fnew("1" = "groupx", "2" = "groupy", "3" = "groupz", name = "typefmt", type = "numeric") # Per-group character formats fnew("positive" = "agree", "negative" = "disagree", "neutral" = "notsure", name = "groupx", type = "character") fnew("positive" = "accept", "negative" = "reject", "neutral" = "possible", name = "groupy", type = "character") fnew("positive" = "pass", "negative" = "fail", "neutral" = "retest", name = "groupz", type = "character") type <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) response <- c("positive", "negative", "neutral", "positive", "negative", "neutral", "positive", "negative", "neutral") # Step 1: map type -> format name respfmt <- fput(type, "typefmt") # Step 2: apply per-element format word <- fputc(response, respfmt) data.frame(type = type, response = response, respfmt = respfmt, word = word) fclear() ``` ## Example 17: Working with Dates and Formats — PUTN A SAS-style workflow where format names are looked up dynamically per observation: ```{r dates-putn} # Format that maps key codes to date format names fnew("1" = "date9.", "2" = "mmddyy10.", name = "writfmt", type = "numeric") fnew_date("date9.") fnew_date("mmddyy10.") # Input data (R date numbers = days since 1970-01-01) number <- c(12103, 10899) key <- c(1, 2) # Look up format name per observation datefmt <- fputn(key, "writfmt") # Apply per-element date format date <- fputn(number, datefmt) data.frame(number = number, key = key, datefmt = datefmt, date = date) fclear() ``` ## Example 18: Import SAS Formats from CNTLOUT CSV The `fimport()` function reads a CSV file exported from a SAS format catalogue (`PROC FORMAT ... CNTLOUT=`): ```{r cntlout-import} csv_path <- system.file("extdata", "test_cntlout.csv", package = "ksformat") ``` ```{r cntlout-use} imported <- fimport(csv_path) names(imported) flist() fprint() ``` ### Use Imported Formats ```{r cntlout-apply} # Character format (GENDER) gender_codes <- c("M", "F", NA, "X") data.frame( code = gender_codes, label = fputc(gender_codes, "GENDER") ) # Numeric format (AGEGRP) ages <- c(5, 17, 18, 45, 65, 100, NA, -1) data.frame( age = ages, group = fputn(ages, "AGEGRP") ) # Numeric format (BMICAT) bmi_values <- c(15.0, 18.5, 22.3, 25.0, 28.7, 30.0, 35.5) data.frame( bmi = bmi_values, category = fputn(bmi_values, "BMICAT") ) # Invalue (RACEIN) race_labels <- c("White", "Black", "Asian", "Other") data.frame( label = race_labels, code = finputn(race_labels, "RACEIN") ) ``` ### Apply to Data Frame ```{r cntlout-df} df <- data.frame( id = 1:5, sex = c("M", "F", "M", NA, "F"), age = c(10, 30, 70, NA, 50), stringsAsFactors = FALSE ) gender_fmt <- imported[["GENDER"]] age_fmt <- imported[["AGEGRP"]] fput_df(df, sex = gender_fmt, age = age_fmt, suffix = "_label") ``` ### Export Imported Format ```{r cntlout-export} cat(fexport(AGEGRP = age_fmt)) cat(fexport(GENDER = gender_fmt)) ``` ### Selective Import (No Auto-register) ```{r cntlout-manual} fclear() manual <- fimport(csv_path, register = FALSE) # Library should be empty flist() fprint() # Use directly from returned list fput(c("M", "F"), manual[["GENDER"]]) fclear() ``` ## Example 19: Bilingual Format Expression labels can select between languages at apply-time using an extra argument: ```{r bilingual} # Single format, language selected via .x1 extra argument sex_bi <- fnew( "M" = "ifelse(.x1 == 'en', 'Male', 'Homme')", "F" = "ifelse(.x1 == 'en', 'Female', 'Femme')", .missing = "Unknown", name = "sex_bi" ) # .x1 = language code per observation fput(c("M", "F", "M"), sex_bi, c("en", "fr", "en")) # -> "Male" "Femme" "Male" # Alternative: one format per language, selected at apply-time fnew("M" = "Male", "F" = "Female", .missing = "Unknown", name = "sex_en") fnew("M" = "Homme", "F" = "Femme", .missing = "Inconnu", name = "sex_fr") lang <- "fr" fput(c("M", "F", NA), paste0("sex_", lang)) # -> "Homme" "Femme" "Inconnu" fclear() ``` ## Example 20: Composite Key Lookup with `fputk()` `fputk()` pastes multiple vectors into a composite key before format lookup. This is useful when a format is keyed on the combination of several columns, a common pattern in clinical data (e.g., looking up a visit date by subject + visit number). ```{r fputk-setup} # Simulate a Subject Visits (SV) domain SV <- data.frame( USUBJID = c("SUBJ-001", "SUBJ-001", "SUBJ-001", "SUBJ-002", "SUBJ-002"), VISITNUM = c(1, 2, 3, 1, 2), SVSTDTC = c("2025-01-15", "2025-02-20", "2025-03-10", "2025-01-18", "2025-02-25"), stringsAsFactors = FALSE ) # Simulate a Questionnaires (QS) domain QS <- data.frame( USUBJID = c("SUBJ-001", "SUBJ-001", "SUBJ-002", "SUBJ-002", "SUBJ-002"), VISITNUM = c(1, 2, 1, 2, 3), QSTESTCD = c("SCORE1", "SCORE1", "SCORE1", "SCORE1", "SCORE1"), QSSTRESN = c(85, 90, 72, 78, NA), stringsAsFactors = FALSE ) SV QS ``` ### Character lookup (returns character strings) Register a format keyed on `USUBJID|VISITNUM` with values being the visit start date (`SVSTDTC`) as character strings: ```{r fputk-register} # Create composite key -> date string mapping from SV fnew( fmap(paste(SV$USUBJID, SV$VISITNUM, sep = "|"), SV$SVSTDTC), .other = "NOT FOUND", name = "svdtc", type = "character", ignore_case = TRUE ) fprint("svdtc") ``` Now look up visit dates in the QS domain using `fputk()`: ```{r fputk-apply} QS$SVSTDTC <- fputk(QS$USUBJID, QS$VISITNUM, format = "svdtc") QS class(QS$SVSTDTC) # character fclear() ``` ### Native Date lookup (returns Date objects) Using `type = "Date"`, values are stored as native R `Date` objects and `fput()`/`fputk()` return them directly — no string conversion needed: ```{r fputk-date} # Create composite key -> Date mapping from SV fnew( fmap( paste(SV$USUBJID, SV$VISITNUM, sep = "|"), as.Date(SV$SVSTDTC, format = "%Y-%m-%d") ), .other = NA, name = "svdtn", type = "Date", ignore_case = TRUE ) fprint("svdtn") ``` ```{r fputk-date-apply} QS$SVSTDTC_DT <- fputk(QS$USUBJID, QS$VISITNUM, format = "svdtn") QS class(QS$SVSTDTC_DT) # Date # Typed NA for unmatched keys (SUBJ-002 Visit 3 not in SV) is.na(QS$SVSTDTC_DT[5]) # Date arithmetic works directly QS$SVSTDTC_DT + 7 # add 7 days fclear() ``` ## Example 21: Consistent Data-Driven Formats with `fmap()` When building formats from data (e.g., a data frame with 1000+ rows), you need a named vector mapping keys to values. By default, `fnew()` treats named vectors differently depending on the output type: - **Value types** (`Date`, `POSIXct`, `logical`): `c(key = value)` — natural direction, no reversal. - **Character / numeric**: `c(Label = "Code")` — R convention, names and values are **swapped** internally. This inconsistency is confusing for data-driven formats. The `fmap()` helper solves it: `fmap(keys, values)` works identically for **all** types. ### Clinical-data example Suppose we have a demographics dataset and need two lookup formats from the same data — one returning Date objects, one returning character strings: ```{r fmap-setup} library(ksformat) dm <- data.frame( USUBJID = c("SUBJ-001", "SUBJ-002", "SUBJ-003"), SUBJID = c("001", "002", "003"), RFICDTC = c("2023-03-09T08:45", "2024-08-13T09:53", "2025-06-17T09:03"), stringsAsFactors = FALSE ) # Composite key for both formats keys <- paste(dm$USUBJID, dm$SUBJID, sep = "|") ``` ### Same `fmap(keys, values)` pattern for both types Both formats use the **identical** calling style — `fmap(keys, values)` where keys are input lookup values and values are output objects: ```{r fmap-date} # Date lookup fnew( fmap(keys, as.Date(dm$RFICDTC, format = "%Y-%m-%d")), .other = NA, type = "Date", ignore_case = TRUE, name = "icdtn" ) # Character lookup — same fmap(keys, values) pattern! fnew( fmap(keys, dm$RFICDTC), .other = "NOT FOUND", type = "character", ignore_case = TRUE, name = "icdtc" ) fprint("icdtn") fprint("icdtc") ``` ```{r fmap-apply} # Both return the expected results fputk("SUBJ-001", "001", format = "icdtn") class(fputk("SUBJ-001", "001", format = "icdtn")) fputk("SUBJ-001", "001", format = "icdtc") class(fputk("SUBJ-001", "001", format = "icdtc")) fclear() ``` No extra parameters needed — `fmap()` tells `fnew()` to use the natural direction for all types. ### When to use the default (reversal on) The default auto-reversal preserves the standard R convention where `c(Label = "Code")` maps `Code -> Label`. This is natural for hand-written formats: ```{r fmap-default} # These are equivalent — both map "M" -> "Male" fmt_a <- fnew(c(Male = "M", Female = "F")) fmt_b <- fnew("M" = "Male", "F" = "Female") identical(fput(c("M", "F"), fmt_a), fput(c("M", "F"), fmt_b)) fclear() ``` ### Summary | Use case | Style | Reversal | |:--------------------------|:--------------------------------------|:----------| | Data-driven (any type) | `fmap(keys, values)` | Suppressed | | Hand-written (char/num) | `c(Label = "Code")` or `"Code" = "Label"` | Auto (default) | | Value types (`Date`, etc.)| `fmap(keys, values)` or `setNames(values, keys)` | No reversal (default) | ## Example 22: Date Lookup via `fparse()` and `fputk()` Examples 20–21 built composite-key formats programmatically with `fnew()` and `fmap()`. When the mapping is **small and known in advance** (e.g., a study-specific visit schedule), you can define the same lookup entirely in text with `fparse()`. ### Character date lookup The simplest approach: store dates as character strings using a regular `character` format. ```{r fparse-date-char} fparse(text = ' VALUE svdtc (character, nocase) "SUBJ-001|1" = "2025-01-15" "SUBJ-001|2" = "2025-02-20" "SUBJ-001|3" = "2025-03-10" "SUBJ-002|1" = "2025-01-18" "SUBJ-002|2" = "2025-02-25" .other = "NOT FOUND" ; ') fprint("svdtc") ``` Apply with `fputk()` to look up visit dates from a questionnaire domain: ```{r fparse-date-char-apply} QS <- data.frame( USUBJID = c("SUBJ-001", "SUBJ-001", "SUBJ-002", "SUBJ-002", "SUBJ-002"), VISITNUM = c(1, 2, 1, 2, 3), QSSTRESN = c(85, 90, 72, 78, NA), stringsAsFactors = FALSE ) QS$SVSTDTC <- fputk(QS$USUBJID, QS$VISITNUM, format = "svdtc") QS fclear() ``` ### Native Date lookup Use the `Date` value type with `format:` to store dates as native R `Date` objects. The `format:` parameter tells `fparse()` how to parse the date strings in the text block: ```{r fparse-date-native} fparse(text = ' VALUE svdtn (Date, format: %Y-%m-%d, nocase) "SUBJ-001|1" = "2025-01-15" "SUBJ-001|2" = "2025-02-20" "SUBJ-001|3" = "2025-03-10" "SUBJ-002|1" = "2025-01-18" "SUBJ-002|2" = "2025-02-25" ; ') fprint("svdtn") ``` Now `fputk()` returns real `Date` objects — arithmetic and comparison work directly: ```{r fparse-date-native-apply} QS$SVSTDTC_DT <- fputk(QS$USUBJID, QS$VISITNUM, format = "svdtn") QS class(QS$SVSTDTC_DT) # Date is.na(QS$SVSTDTC_DT[5]) # TRUE — no match for SUBJ-002 Visit 3 # Date arithmetic works directly QS$SVSTDTC_DT + 7 ``` ### Round-trip: export and re-import Formats created with `fparse()` can be exported back to text with `fexport()` and re-parsed — useful for version-controlled format definitions: ```{r fparse-date-roundtrip} fmt_obj <- format_get("svdtn") txt <- fexport(svdtn = fmt_obj) cat(txt) ``` ```{r fparse-date-reimport} # Re-parse the exported text fclear() fparse(text = txt) # Verify it still works fputk("SUBJ-001", 2, format = "svdtn") fclear() ``` ## Example 23: Inspecting Range Rules with `franges()` `franges()` extracts all range-based mappings from a format and returns them as a tidy `data.frame` — useful for auditing, documentation, or downstream processing. ```{r franges-basic} fparse(text = ' VALUE age (numeric) [0, 18) = "Child" [18, 65) = "Adult" [65, HIGH] = "Senior" .missing = "Unknown" ; ') franges("age") ``` You can use the result like any data frame — filter, display, or feed into further calculations: ```{r franges-filter} df <- franges("age") # Which ranges have a finite upper bound? df[is.finite(df$high), ] ``` `franges()` silently excludes discrete entries (`.missing`, `.other`, plain string keys) — only range rows appear. It returns an empty `data.frame` with the same columns when the format contains no ranges. ```{r franges-discrete} fnew("M" = "Male", "F" = "Female", .missing = "Unknown", name = "sex") franges("sex") # 0 rows ``` ```{r franges-cleanup, include=FALSE} fclear() ``` ## Example 24: Reverse Range Lookup with `fmap_to_ranges()` When a range format stores **numeric codes** as its labels (e.g. visit windows coded as weeks), `fmap_to_ranges()` turns a vector of those codes back into the original `[low, high]` bounds — one row per input value. ```{r fmap-to-ranges} fparse(text = ' VALUE visit_ther (numeric) [LOW, 1] = 0 [ 8, 22] = 2 [22, 36] = 4 [37, 50] = 6 [51, 63] = 8 [64, 78] = 10 [79, 91] = 12 ; ') coded_weeks <- c(0, 2, 4, 6, 8, 10, 12) fmap_to_ranges(coded_weeks, "visit_ther") ``` Unmatched values produce `NA` rows, making it safe to pass arbitrary vectors: ```{r fmap-to-ranges-na} fmap_to_ranges(c(2, 99, 4), "visit_ther") ``` ```{r fmap-to-ranges-cleanup, include=FALSE} fclear() ``` ## Example 25: Date Range Bucketing `date_range` and `datetime_range` formats bucket `Date` or `POSIXct` input into character labels using ISO date/datetime interval bounds. They reuse the same range-table engine as numeric ranges, so the `findInterval()` fast path is active for sorted, disjoint buckets. ### Fiscal-year bucketing ```{r date-range-basic} fnew( "2023-01-01,2024-01-01,TRUE,FALSE" = "FY23", "2024-01-01,2025-01-01,TRUE,FALSE" = "FY24", "2025-01-01,2026-01-01,TRUE,FALSE" = "FY25", type = "date_range", name = "fiscal_year" ) dates <- as.Date(c("2023-06-15", "2024-03-01", "2024-12-31", "2025-07-04", "2022-01-01", NA)) data.frame( date = dates, fy = fput(dates, "fiscal_year") ) ``` ### Define from text with `fparse()` ```{r date-range-fparse} fparse(text = ' VALUE quarter (date_range) [2024-01-01, 2024-04-01) = "Q1-2024" [2024-04-01, 2024-07-01) = "Q2-2024" [2024-07-01, 2024-10-01) = "Q3-2024" [2024-10-01, 2025-01-01) = "Q4-2024" .other = "Outside 2024" ; ') sample_dates <- as.Date(c("2024-02-14", "2024-05-20", "2024-08-08", "2024-11-30", "2025-03-01")) data.frame( date = sample_dates, quarter = fput(sample_dates, "quarter") ) ``` ### `LOW` / `HIGH` open-ended bounds `LOW` and `HIGH` represent $-\infty$ and $+\infty$ — any date before or after a cutpoint falls in the open arm. ```{r date-range-low-high} fparse(text = ' VALUE era (date_range) [LOW, 2000-01-01) = "Pre-2000" [2000-01-01, 2010-01-01) = "2000s" [2010-01-01, 2020-01-01) = "2010s" [2020-01-01, HIGH] = "2020+" ; ') event_dates <- as.Date(c("1985-07-04", "2005-12-25", "2015-06-01", "2023-11-11")) data.frame( date = event_dates, era = fput(event_dates, "era") ) ``` ### Export and roundtrip Formats export with ISO date bounds and re-parse without loss: ```{r date-range-export} q_obj <- format_get("quarter") cat(fexport(quarter = q_obj)) ``` ```{r date-range-roundtrip} # Re-parse the exported text txt <- fexport(quarter = q_obj) fclear() fparse(text = txt) fput(as.Date(c("2024-02-14", "2024-08-08")), "quarter") ``` ### Overlapping buckets with `multilabel` and `fput_all()` ```{r date-range-multilabel} fparse(text = ' VALUE study_window (date_range, multilabel) [2024-01-01, 2024-07-01) = "First Half" [2024-04-01, 2024-10-01) = "Mid-Year" [2024-07-01, 2025-01-01) = "Second Half" ; ') checkup_dates <- as.Date(c("2024-02-15", "2024-05-20", "2024-09-01")) all_windows <- fput_all(checkup_dates, "study_window") for (i in seq_along(checkup_dates)) { cat(format(checkup_dates[i]), "->", paste(all_windows[[i]], collapse = " | "), "\n") } ``` ### Auto-detection of type When no explicit type is given, `fparse()` infers `date_range` from ISO date bounds and `datetime_range` when bounds include a time component: ```{r date-range-autodetect} fparse(text = ' VALUE auto_fy [2024-01-01, 2025-01-01) = "2024" ; VALUE auto_shift [2024-01-15 08:00, 2024-01-15 16:00) = "Day shift" ; ') cat("auto_fy type :", format_get("auto_fy")$type, "\n") cat("auto_shift type:", format_get("auto_shift")$type, "\n") ``` ### Datetime range bucketing `datetime_range` works identically to `date_range` but matches against POSIXct values. Bounds are expressed as `YYYY-MM-DD HH:MM[:SS]` strings. ```{r datetime-range} fparse(text = ' VALUE shift (datetime_range) [2024-01-15 00:00, 2024-01-15 08:00) = "Night" [2024-01-15 08:00, 2024-01-15 16:00) = "Day" [2024-01-15 16:00, 2024-01-16 00:00) = "Evening" ; ') timestamps <- as.POSIXct( c("2024-01-15 03:22:00", "2024-01-15 11:45:00", "2024-01-15 19:00:00"), tz = "UTC" ) data.frame( ts = format(timestamps, tz = "UTC"), shift = fput(timestamps, "shift") ) ``` ```{r date-range-cleanup, include=FALSE} fclear() ``` ## Example 26: Stratified Range Lookup with `fputk()` The `stratified_range` type combines a discrete stratum (such as a study arm, subject id, or any composite key) with a numeric / Date / POSIXct range. Each stratum has its own bucket boundaries, and `fputk()` dispatches to the right bucket for each row. ### Programmatic construction with `fmap_strata()` ```{r strat-num} visits <- fmap_strata( stratum = c("ARM_A", "ARM_A", "ARM_A", "ARM_B", "ARM_B"), low = c(0, 7, 28, 0, 14), high = c(7, 28, Inf, 14, Inf), label = c("Baseline", "Wk1-3", "Wk4+", "Baseline", "Wk2+"), inc_high = c(FALSE, FALSE, TRUE, FALSE, TRUE) ) fnew(visits, type = "stratified_range", ".other|ARM_A" = "A_outside", .other = "outside_window", name = "vw") df <- data.frame( arm = c("ARM_A", "ARM_A", "ARM_B", "ARM_B", "ARM_C"), day = c(3, 35, 5, 40, 10) ) df$visit <- fputk(df$arm, df$day, format = "vw") df ``` ### Text definition with `fparse()` ```{r strat-text} fparse(text = ' VALUE vw_text (stratified_range, range_subtype: numeric) "ARM_A"|[0, 7) = "Baseline" "ARM_A"|[7, 28) = "Wk1-3" "ARM_A"|[28, HIGH]= "Wk4+" "ARM_B"|[0, 14) = "Baseline" "ARM_B"|[14, HIGH]= "Wk2+" ".other|ARM_A" = "A_outside" .other = "outside_window" ; ') fputk(df$arm, df$day, format = "vw_text") ``` ### Date subtype: per-subject windows ```{r strat-date} windows <- fmap_strata( stratum = c("S001", "S001", "S002", "S002"), low = as.Date(c("2024-01-01", "2024-01-15", "2024-02-01", "2024-02-20")), high = as.Date(c("2024-01-15", "2024-02-01", "2024-02-20", "2024-03-10")), label = c("Screen", "Treat", "Screen", "Treat") ) fnew(windows, type = "stratified_range", range_subtype = "date", .other = "off-window", name = "win") subj <- c("S001", "S001", "S002", "S002", "S003") visits <- as.Date(c("2024-01-05", "2024-01-20", "2024-02-10", "2024-03-01", "2024-01-01")) data.frame( subj = subj, date = visits, phase = fputk(subj, visits, format = "win") ) ``` ### Roundtrip via `fexport()` / `fparse()` ```{r strat-roundtrip} txt <- fexport(format_get("vw")) cat(txt, "\n") fclear() fparse(text = txt) fputk(df$arm, df$day, format = "vw") ``` ```{r strat-cleanup, include=FALSE} fclear() ``` ## Example 27: Plain Range Lookup with `fmap_ranges()` For non-stratified numeric / Date ranges, `fmap_ranges()` saves you from hand-crafting canonical \code{"low,high,inc_low,inc_high"} keys. ```{r fmap-ranges-num} age_groups <- fmap_ranges( low = c(0, 18, 65), high = c(18, 65, Inf), label = c("Child", "Adult", "Senior"), inc_high = c(FALSE, FALSE, TRUE) ) fnew(age_groups, type = "numeric", name = "ag") fput(c(5, 25, 90), "ag") fclear() ``` ## Example 28: Composite Key Lookup with NA Components (`na_as_string`) When building a format from data using `fmap(paste(..., sep = "|"), values)`, base R's `paste()` converts any `NA` component to the **literal string** `"NA"`. The resulting composite key is therefore `"CAT|TEST|NA"`, not a missing value. By default, `fputk()` restores `NA_character_` wherever any component is `NA` before the lookup — so the key `"CAT|TEST|NA"` is never reached and the row falls through to `.other` / `.missing`. Setting `na_as_string = TRUE` keeps `paste()`'s literal `"NA"`, making the round-trip consistent. ### Clinical example — LB parameter derivation A common ADaM task: derive `PARAMCD` from a combination of `LBCAT`, `LBSPEC`, `LBTESTCD`, and `LBSTRESU`, where some rows have `LBSTRESU = NA` (dimensionless tests such as INR). ```{r na-str-setup} # Source lab mapping (as received from a specification) lb_map <- data.frame( LBCAT = c("BLOOD CHEMISTRY", "COAGULOGRAM", "COAGULATION PANEL", "COAGULOGRAM"), LBSPEC = c("BLOOD", "BLOOD", "BLOOD", "BLOOD"), LBTESTCD = c("ALB", "FIBRINO", "INR", "INR"), LBSTRESU = c("g/L", "g/L", NA, NA), PARAMCD = c("ALB", "FIBRINO", "INR", "INR"), stringsAsFactors = FALSE ) lb_map ``` Build the format with `fmap(paste(...), PARAMCD)`. `paste()` converts `NA` in `LBSTRESU` to `"NA"`, so the stored keys for INR rows are `"COAGULATION PANEL|BLOOD|INR|NA"` and `"COAGULOGRAM|BLOOD|INR|NA"`. ```{r na-str-build} with(lb_map, fmap(paste(LBCAT, LBSPEC, LBTESTCD, LBSTRESU, sep = "|"), PARAMCD) ) |> fnew(ignore_case = TRUE, .other = NA, type = "character", name = "lb_param") fprint("lb_param") ``` Now apply the format. With the default `na_as_string = FALSE`, the INR rows get `NA` (no match): ```{r na-str-default} lb_map$PARAMCD_default <- with(lb_map, fputk(LBCAT, LBSPEC, LBTESTCD, LBSTRESU, format = "lb_param") ) lb_map[, c("LBTESTCD", "LBSTRESU", "PARAMCD", "PARAMCD_default")] ``` With `na_as_string = TRUE`, `paste()` also converts the lookup-side `NA` to `"NA"`, so the keys match: ```{r na-str-correct} lb_map$PARAMCD_back <- with(lb_map, fputk(LBCAT, LBSPEC, LBTESTCD, LBSTRESU, format = "lb_param", na_as_string = TRUE) ) lb_map[, c("LBTESTCD", "LBSTRESU", "PARAMCD", "PARAMCD_back")] ``` ```{r na-str-cleanup, include=FALSE} fclear() ``` ### The rule of thumb > Use `na_as_string = TRUE` whenever the format was built with > `fmap(paste(...), values)` **and** any key column can contain `NA`. If the format keys were set by hand (`fnew("CAT|TEST|g/L" = "ALB", ...)`), `NA` components should still go through `.missing` — keep the default `na_as_string = FALSE`. ## Example 29: Composite Label Invalue Lookup with `finputk()` `finputk()` is the invalue-side mirror of `fputk()`: it pastes multiple columns into a composite label and reverse-looks it up in a `ks_invalue` format. The same `na_as_string` argument applies. ### Basic usage ```{r finputk-basic} # Build an INVALUE from two-column composite labels finput( fmap(paste(c("BLOOD CHEMISTRY", "COAGULOGRAM", "COAGULATION PANEL"), c("ALB", "FIBRINO", "INR"), sep = "|"), c(1L, 2L, 3L)), target_type = "integer", name = "lb_code_inv" ) # Reverse lookup: two separate columns → integer code cat_vec <- c("BLOOD CHEMISTRY", "COAGULOGRAM", "COAGULATION PANEL", "OTHER") test_vec <- c("ALB", "FIBRINO", "INR", "X") finputk(cat_vec, test_vec, invalue_name = "lb_code_inv") # BLOOD CHEMISTRY|ALB → 1, COAGULOGRAM|FIBRINO → 2, # COAGULATION PANEL|INR → 3, OTHER|X → NA (no match → missing_value) fclear() ``` ### With NA components (`na_as_string = TRUE`) When the INVALUE was built from data containing `NA` columns, use `na_as_string = TRUE` on both the build side (`fmap(paste(...), ...)`) and the lookup side (`finputk(..., na_as_string = TRUE)`). ```{r finputk-na} # INVALUE where LBSTRESU can be NA (like INR) finput( fmap( paste(lb_map$LBCAT, lb_map$LBTESTCD, lb_map$LBSTRESU, sep = "|"), seq_len(nrow(lb_map)) ), target_type = "integer", name = "lb_row_inv" ) # Reconstruct lb_map row indices — works even when LBSTRESU is NA finputk(lb_map$LBCAT, lb_map$LBTESTCD, lb_map$LBSTRESU, invalue_name = "lb_row_inv", na_as_string = TRUE) fclear() ``` The output type is always determined by the stored invalue's `target_type` (here `integer`). For character output create the invalue with `target_type = "character"` and `finputk()` returns a character vector.