lgspline {lgspline}    R Documentation

Fit Lagrangian Multiplier Smoothing Splines

Description

A comprehensive software package for fitting a variant of smoothing splines as a constrained optimization problem, avoiding the need to algebraically disentangle a spline basis after fitting, and allowing for interpretable interactions and non-spline effects to be included.

lgspline fits piecewise polynomial regression splines constrained to be smooth where they meet, penalized by the squared, integrated, second-derivative of the estimated function with respect to predictors.

The method of Lagrangian multipliers is used to enforce smoothness at the knots: fitted values, first derivatives, and second derivatives (and interaction terms, when included) are constrained to agree where neighboring partitions meet.

The coefficients are penalized by an analytical form of the traditional cubic smoothing spline penalty, as well as by tunable modifications that allow the penalty to differ across predictors and partitions.

This package supports model fitting for multiple spline and non-spline effects, GLM families, Weibull accelerated failure time (AFT) models, correlation structures, quadratic programming constraints, and extensive customization for user-defined models.

In addition, parallel processing capabilities and comprehensive tools for visualization and for frequentist and Bayesian inference are provided.

Usage

lgspline(predictors = NULL, y = NULL, formula = NULL, response = NULL,
                standardize_response = TRUE, standardize_predictors_for_knots = TRUE,
                standardize_expansions_for_fitting = TRUE, family = gaussian(),
                glm_weight_function = function(mu, y, order_indices, family, dispersion,
                                               observation_weights, ...) {
                  if(any(!is.null(observation_weights))){
                    family$variance(mu) * observation_weights
                  } else {
                    family$variance(mu)
                  }
                }, shur_correction_function = function(X, y, B, dispersion, order_list, K,
                                                       family, observation_weights, ...) {
                  lapply(1:(K+1), function(k) 0)
                }, need_dispersion_for_estimation = FALSE,
                dispersion_function = function(mu, y, order_indices, family,
                                               observation_weights, ...) { 1 },
                K = NULL, custom_knots = NULL, cluster_on_indicators = FALSE,
                make_partition_list = NULL, previously_tuned_penalties = NULL,
                smoothing_spline_penalty = NULL, opt = TRUE, use_custom_bfgs = TRUE,
                delta = NULL, tol = 10*sqrt(.Machine$double.eps),
                invsoftplus_initial_wiggle = c(-25, 20, -15, -10, -5),
                invsoftplus_initial_flat = c(-14, -7), wiggle_penalty = 2e-07,
                flat_ridge_penalty = 0.5, unique_penalty_per_partition = TRUE,
                unique_penalty_per_predictor = TRUE, meta_penalty = 1e-08,
                predictor_penalties = NULL, partition_penalties = NULL,
                include_quadratic_terms = TRUE, include_cubic_terms = TRUE,
                include_quartic_terms = NULL, include_2way_interactions = TRUE,
                include_3way_interactions = TRUE, include_quadratic_interactions = FALSE,
                offset = c(), just_linear_with_interactions = NULL,
                just_linear_without_interactions = NULL, exclude_interactions_for = NULL,
                exclude_these_expansions = NULL, custom_basis_fxn = NULL,
                include_constrain_fitted = TRUE, include_constrain_first_deriv = TRUE,
                include_constrain_second_deriv = TRUE,
                include_constrain_interactions = TRUE, cl = NULL, chunk_size = NULL,
                parallel_eigen = TRUE, parallel_trace = FALSE, parallel_aga = FALSE,
                parallel_matmult = FALSE, parallel_unconstrained = TRUE,
                parallel_find_neighbors = FALSE, parallel_penalty = FALSE,
                parallel_make_constraint = FALSE,
                unconstrained_fit_fxn = unconstrained_fit_default,
                keep_weighted_Lambda = FALSE, iterate_tune = TRUE,
                iterate_final_fit = TRUE, blockfit = FALSE,
                qp_score_function = function(X, y, mu, order_list, dispersion,
                                             VhalfInv, observation_weights, ...) {
                  if(!is.null(observation_weights)) {
                    crossprod(X, cbind((y - mu)*observation_weights))
                  } else {
                    crossprod(X, cbind(y - mu))
                  }
                }, qp_observations = NULL, qp_Amat = NULL, qp_bvec = NULL, qp_meq = 0,
                qp_positive_derivative = FALSE, qp_negative_derivative = FALSE,
                qp_monotonic_increase = FALSE, qp_monotonic_decrease = FALSE,
                qp_range_upper = NULL, qp_range_lower = NULL, qp_Amat_fxn = NULL,
                qp_bvec_fxn = NULL, qp_meq_fxn = NULL, constraint_values = cbind(),
                constraint_vectors = cbind(), return_G = TRUE, return_Ghalf = TRUE,
                return_U = TRUE, estimate_dispersion = TRUE, unbias_dispersion = NULL,
                return_varcovmat = TRUE, custom_penalty_mat = NULL,
                cluster_args = c(custom_centers = NA, nstart = 10),
                dummy_dividor = 1.2345672152894e-22,
                dummy_adder = 2.234567210529e-18, verbose = FALSE,
                verbose_tune = FALSE, expansions_only = FALSE,
                observation_weights = NULL, do_not_cluster_on_these = c(),
                neighbor_tolerance = 1 + 1e-08, null_constraint = NULL,
                critical_value = qnorm(1 - 0.05/2), data = NULL, weights = NULL,
                no_intercept = FALSE, correlation_id = NULL, spacetime = NULL,
                correlation_structure = NULL, VhalfInv = NULL, Vhalf = NULL,
                VhalfInv_fxn = NULL, Vhalf_fxn = NULL, VhalfInv_par_init = c(),
                REML_grad = NULL, custom_VhalfInv_loss = NULL, VhalfInv_logdet = NULL,
                include_warnings = TRUE, ...)

Arguments

predictors

Default: NULL. Numeric matrix or data frame of predictor variables. Supports direct matrix input or formula interface when used with 'data' argument. Must contain numeric predictors, with categorical variables pre-converted to numeric indicators.

y

Default: NULL. Numeric response variable vector representing the target/outcome/dependent variable etc. to be modeled.

formula

Default: NULL. Optional statistical formula for model specification, serving as an alternative to direct matrix input. Supports standard R formula syntax with special 'spl()' function for defining spline terms.
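
A brief, minimal sketch of the two calling styles (matrix input vs. formula with 'spl()'); the data here are simulated purely for illustration:

df <- data.frame(x1 = rnorm(250), x2 = rnorm(250))
df$y <- sin(df$x1) + 0.25*df$x2 + rnorm(250)
fit_matrix  <- lgspline(predictors = df[, c('x1', 'x2')], y = df$y) # matrix interface
fit_formula <- lgspline(y ~ spl(x1) + x1*x2, data = df)             # spl() marks spline terms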

response

Default: NULL. Alternative name for the response variable, providing compatibility with different naming conventions. Used only if 'y' is not supplied.

standardize_response

Default: TRUE. Logical indicator controlling whether the response variable should be centered by mean and scaled by standard deviation before model fitting. When TRUE, tends to improve numerical stability. Only offered for identity link functions (when family$link == 'identity').

standardize_predictors_for_knots

Default: TRUE. Logical flag determining whether predictor variables should be standardized before knot placement. Ensures consistent knot selection across different predictor scales, so that no one predictor dominates knot placement. For all expansions (x), standardization corresponds to dividing by the difference between the 69th and 31st percentiles: x / (quantile(x, 0.69) - quantile(x, 0.31)).

standardize_expansions_for_fitting

Default: TRUE. Logical switch to standardize polynomial basis expansions during model fitting. Provides computational stability during penalty tuning without affecting statistical inference, as design matrices, variance-covariance matrices, and coefficient estimates are systematically backtransformed after fitting to account for the standardization. If TRUE, \textbf{U} and \textbf{G} will remain on the transformed scale, and B_raw as returned will correspond to the coefficients fitted on the expansion-standardized scale.

family

Default: gaussian(). Generalized linear model (GLM) distribution family specifying the error distribution and link function for model fitting. Defaults to Gaussian distribution with identity link. Supports custom family specifications, including user-defined link functions and optional custom tuning loss criteria. Minimally requires 1) family name (family) 2) link name (link) 3) linkfun (link function) 4) linkinv (link function inverse) 5) variance (mean variance relationship function, \text{Var}(Y|\mu)).
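
As a small sketch, a standard family object already carries the five components listed above; for example:

fam <- Gamma(link = 'log')
fam$family                    # 1) family name
fam$link                      # 2) link name
fam$linkfun(2)                # 3) link function
fam$linkinv(fam$linkfun(2))   # 4) inverse link (recovers 2)
fam$variance(2)               # 5) mean-variance relationship, Var(Y|mu) = mu^2 here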

glm_weight_function

Default: function that returns family$variance(mu) * observation_weights if weights exist, family$variance(mu) otherwise. Codes the mean-variance relationship of a GLM or GLM-like model, the diagonal \textbf{W} matrix of \textbf{X}^T\textbf{W}\textbf{X} that appears in the information. This can be replaced with a user-specified function. It is used for updating \textbf{G} = (\textbf{X}^{T}\textbf{W}\textbf{X} + \textbf{L})^{-1} after obtaining constrained estimates of coefficients. This is not used for fitting unconstrained models, but for iterating between updates of \textbf{U}, \textbf{G}, and beta coefficients afterwards.

shur_correction_function

Default: function that returns list of zeros. Advanced function for computing Schur complements \textbf{S} to add to \textbf{G} to properly account for uncertainty in dispersion or other nuisance parameter estimation. The effective information becomes \textbf{G}^* = (\textbf{G}^{-1} + \textbf{S})^{-1}.

need_dispersion_for_estimation

Default: FALSE. Logical indicator specifying whether a dispersion parameter is required for coefficient estimation. This is not needed for canonical regular exponential family models, but is often needed otherwise (such as fitting Weibull AFT models).

dispersion_function

Default: function that returns 1. Custom function for estimating the dispersion parameter. Unless need_dispersion_for_estimation is TRUE, this will not affect coefficient estimates.

K

Default: NULL. Integer specifying the number of knot locations for spline partitions. This can intuitively be considered the total number of partitions - 1.

custom_knots

Default: NULL. Optional matrix providing user-specified knot locations in 1-D.

cluster_on_indicators

Default: FALSE. Logical flag determining whether indicator variables should be used for clustering knot locations.

make_partition_list

Default: NULL. Optional list allowing direct specification of custom partition assignments. The intent is that the make_partition_list returned by one model can be supplied here to keep the same knot locations for another.

previously_tuned_penalties

Default: NULL. Optional list of pre-computed penalty components from previous model fits.

smoothing_spline_penalty

Default: NULL. Optional custom smoothing spline penalty matrix for fine-tuned complexity control.

opt

Default: TRUE. Logical switch controlling whether model penalties should be automatically optimized via generalized cross-validation. Set this to FALSE if previously_tuned_penalties are supplied and meant to be used; otherwise, the previously_tuned_penalties will be ignored.

use_custom_bfgs

Default: TRUE. Logical indicator selecting between a native implementation of damped-BFGS optimization method with analytical gradients or base R's BFGS implementation with finite-difference approximation of gradients.

delta

Default: NULL. Numeric pseudocount used for stabilizing optimization in non-identity link function scenarios.

tol

Default: 10*sqrt(.Machine$double.eps). Numeric convergence tolerance controlling the precision of optimization algorithms.

invsoftplus_initial_wiggle

Default: c(-25, 20, -15, -10, -5). Numeric vector of initial grid points for wiggle penalty optimization, specified on the inverse-softplus (\text{softplus}(x) = \log(1+e^x)) scale.
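
A small sketch of the inverse-softplus scale implied by the description above (how the grid is consumed internally is an implementation detail; this only illustrates the mapping back to penalty values):

softplus <- function(x) log(1 + exp(x))
softplus(c(-25, 20, -15, -10, -5)) # candidate wiggle penalties on the original scale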

invsoftplus_initial_flat

Default: c(-14, -7). Numeric vector of initial grid points for ridge penalty optimization, specified on the inverse-softplus (\text{softplus}(x) = \log(1+e^x)) scale.

wiggle_penalty

Default: 2e-7. Numeric penalty controlling the integrated squared second derivative, governing function smoothness. It is applied directly as the smoothing spline penalty and is also multiplied by flat_ridge_penalty to obtain the ridge penalty on linear terms.

flat_ridge_penalty

Default: 0.5. Numeric flat ridge penalty for additional regularization on intercepts and linear terms only (it won't affect interactions or quadratic/cubic/quartic terms by default). If custom_penalty_mat is supplied, the penalty applies to the custom penalty matrix instead. This penalty is multiplied by wiggle_penalty to obtain the total ridge penalty; hence, by default, the ridge penalization on linear terms is half the magnitude of that on non-linear terms.
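
A one-line arithmetic sketch of the default relationship described above:

flat_ridge_penalty <- 0.5
wiggle_penalty <- 2e-7
flat_ridge_penalty * wiggle_penalty # total ridge penalty on linear terms = 1e-7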

unique_penalty_per_partition

Default: TRUE. Logical flag allowing the magnitude of the smoothing spline penalty to differ across partitions.

unique_penalty_per_predictor

Default: TRUE. Logical flag allowing the magnitude of the smoothing spline penalty to differ between predictors.

meta_penalty

Default: 1e-8. Numeric "meta-penalty" applied to predictor and partition penalties during tuning. The minimization of GCV is modified to be a penalized minimization problem, with penalty 0.5 \times \text{meta\_penalty} \times (\sum \log(\text{penalty}))^2, such that penalties are pulled towards 1 on the absolute scale (their logarithms towards 0) and their multiplicative effect towards neutrality.
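
As a sketch, the added term under the formula above, with hypothetical penalty values:

meta_penalty <- 1e-8
penalties <- c(0.8, 1.3, 2.1)              # hypothetical per-predictor/per-partition penalties
0.5 * meta_penalty * sum(log(penalties))^2 # term added to the GCV objective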

predictor_penalties

Default: NULL. Optional list of custom penalties specified per predictor.

partition_penalties

Default: NULL. Optional list of custom penalties specified per partition.

include_quadratic_terms

Default: TRUE. Logical switch to include squared predictor terms in basis expansions.

include_cubic_terms

Default: TRUE. Logical switch to include cubic predictor terms in basis expansions.

include_quartic_terms

Default: NULL. Logical switch to include quartic predictor terms in basis expansions. This is highly recommended for fitting models with multiple predictors to avoid over-specified constraints. When NULL (by default), will internally set to FALSE if only one predictor present, and TRUE otherwise.

include_2way_interactions

Default: TRUE. Logical switch to include linear two-way interactions between predictors.

include_3way_interactions

Default: TRUE. Logical switch to include three-way interactions between predictors.

include_quadratic_interactions

Default: FALSE. Logical switch to include linear-quadratic interaction terms.

offset

Default: Empty vector. When non-missing, this is a vector of column indices/names to include as offsets. lgspline will automatically introduce constraints such that the coefficients for offset terms are fixed at 1.

just_linear_with_interactions

Default: NULL. Integer vector specifying columns to retain linear terms with interactions.

just_linear_without_interactions

Default: NULL. Integer vector specifying columns to retain only linear terms without interactions.

exclude_interactions_for

Default: NULL. Integer vector indicating columns to exclude from all interaction terms.

exclude_these_expansions

Default: NULL. Character vector specifying basis expansions to be excluded from the model. These must be named columns of the data, or in the form "_1_", "_2_", "_1_x_2_", "_2_^2" etc. where "1" and "2" indicate column indices of predictor matrix input.

custom_basis_fxn

Default: NULL. Optional user-defined function for generating custom basis expansions. See get_polynomial_expansions.

include_constrain_fitted

Default: TRUE. Logical switch to constrain fitted values at knot points.

include_constrain_first_deriv

Default: TRUE. Logical switch to constrain first derivatives at knot points.

include_constrain_second_deriv

Default: TRUE. Logical switch to constrain second derivatives at knot points.

include_constrain_interactions

Default: TRUE. Logical switch to constrain interaction terms at knot points.

cl

Default: NULL. Parallel processing cluster object for distributed computation (use parallel::makeCluster()).
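
A minimal sketch of supplying and shutting down a cluster (data simulated only for illustration):

t <- runif(500, -10, 10); y <- sin(t) + rnorm(500)
cl <- parallel::makeCluster(2)
fit <- lgspline(t, y, cl = cl)
parallel::stopCluster(cl)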

chunk_size

Default: NULL. Integer specifying custom fixed chunk size for parallel processing.

parallel_eigen

Default: TRUE. Logical flag to enable parallel processing for eigenvalue decomposition computations.

parallel_trace

Default: FALSE. Logical flag to enable parallel processing for trace computation.

parallel_aga

Default: FALSE. Logical flag to enable parallel processing for specific matrix operations involving G and A.

parallel_matmult

Default: FALSE. Logical flag to enable parallel processing for block-diagonal matrix multiplication.

parallel_unconstrained

Default: TRUE. Logical flag to enable parallel processing for unconstrained maximum likelihood estimation.

parallel_find_neighbors

Default: FALSE. Logical flag to enable parallel processing for neighbor identification (which partitions are neighbors).

parallel_penalty

Default: FALSE. Logical flag to enable parallel processing for penalty matrix construction.

parallel_make_constraint

Default: FALSE. Logical flag to enable parallel processing for constraint matrix generation.

unconstrained_fit_fxn

Default: unconstrained_fit_default. Custom function for fitting unconstrained models per partition.

keep_weighted_Lambda

Default: FALSE. Logical flag to retain generalized linear model weights in penalty constraints using Tikhonov parameterization. It is advised to turn this to TRUE when fitting non-canonical GLMs. The default unconstrained_fit_default assumes canonical GLMs when setting up estimating equations; this is not valid for non-canonical GLMs. With keep_weighted_Lambda = TRUE, the Tikhonov parameterization binds \boldsymbol{\Lambda}^{1/2}, the square-root penalty matrix, to the design matrix \textbf{X}_k for each partition k, and family$linkinv(0) to the response vector \textbf{y}_k for each partition before finding unconstrained estimates using base R's glm.fit function. The potential issue is that the weights of the information matrix will appear in the penalty, such that the effective penalty is \boldsymbol{\Lambda}_\text{eff} = \textbf{L}^{1/2}\textbf{W}\textbf{L}^{1/2} rather than just \textbf{L}^{1/2}\textbf{L}^{1/2}. If FALSE, this approach will only be used to supply initial values to a native implementation of damped Newton-Raphson for fitting GLM models (see damped_newton_r and unconstrained_fit_default). For Gamma with log-link, this is fortunately a non-issue since the mean-variance relationship is essentially stabilized, so keep_weighted_Lambda = TRUE is strongly advised.

iterate_tune

Default: TRUE. Logical switch to use iterative optimization during penalty tuning. If FALSE, \textbf{G} and \textbf{U} are constructed from unconstrained \boldsymbol{\beta} estimates when tuning.

iterate_final_fit

Default: TRUE. Logical switch to use iterative optimization for final model fitting. If FALSE, \textbf{G} and \textbf{U} are constructed from unconstrained \boldsymbol{\beta} estimates when fitting the final model after tuning.

blockfit

Default: FALSE. Logical switch to abandon per-partition fitting for non-spline effects without interactions, collapse all matrices into block-diagonal single-matrix form, and fit agnostic to partition. This would be more efficient for many non-spline effects without interactions and relatively few spline effects or non-spline effects with interactions. Ignored if length(just_linear_without_interactions) = 0 after processing formulas and input.

qp_score_function

Default: \textbf{X}^{T}(\textbf{y} - \text{E}[\textbf{y}]), where \text{E}[\textbf{y}] = \boldsymbol{\mu}. A function returning the score of the log-likelihood for optimization (excluding penalization/priors involving \boldsymbol{\Lambda}), which is needed for the formulation of quadratic programming problems, when blockfit = TRUE, and for correlation-structure fitting with GLMs, all of which rely on solve.QP. Accepts arguments "X, y, mu, order_list, dispersion, VhalfInv, observation_weights, ..." in order. As shown in the examples below, a gamma log-link model instead requires \textbf{X}^{T}\textbf{W}(\textbf{y} - \text{E}[\textbf{y}]), with \textbf{W} a diagonal matrix with entries 1/(\boldsymbol{\mu} \times \text{dispersion}). This argument is not needed when fitting non-canonical GLMs without quadratic programming constraints or correlation structures, situations for which keep_weighted_Lambda=TRUE is sufficient.

qp_observations

Default: NULL. Numeric vector of observations to apply constraints to for monotonic and range quadratic programming constraints. Useful for saving computational resources.

qp_Amat

Default: NULL. Constraint matrix for quadratic programming formulation. The Amat argument of solve.QP.

qp_bvec

Default: NULL. Constraint vector for quadratic programming formulation. The bvec argument of solve.QP.

qp_meq

Default: 0. Number of equality constraints in quadratic programming setup. The meq argument of solve.QP.

qp_positive_derivative, qp_monotonic_increase

Default: FALSE. Logical flags to constrain the function to have positive first derivatives/be monotonically increasing using quadratic programming with respect to the order (ascending rows) of the input data set.

qp_negative_derivative, qp_monotonic_decrease

Default: FALSE. Logical flags to constrain the function to have negative first derivatives/be monotonically decreasing using quadratic programming with respect to the order (ascending rows) of the input data set.

qp_range_upper

Default: NULL. Numeric upper bound for constrained fitted values using quadratic programming.

qp_range_lower

Default: NULL. Numeric lower bound for constrained fitted values using quadratic programming.

qp_Amat_fxn

Default: NULL. Custom function for generating Amat matrix in quadratic programming.

qp_bvec_fxn

Default: NULL. Custom function for generating bvec vector in quadratic programming.

qp_meq_fxn

Default: NULL. Custom function for determining meq equality constraints in quadratic programming.

constraint_values

Default: cbind(). Matrix of constraint values for sum constraints. The constraint enforces \textbf{C}^T(\boldsymbol{\beta} - \textbf{c}) = \boldsymbol{0} in addition to smoothing constraints, where \textbf{C} = constraint_vectors and \textbf{c} = constraint_values.

constraint_vectors

Default: cbind(). Matrix of vectors for sum constraints. The constraint enforces \textbf{C}^T(\boldsymbol{\beta} - \textbf{c}) = \boldsymbol{0} in addition to smoothing constraints, where \textbf{C} = constraint_vectors and \textbf{c} = constraint_values.
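
A sketch of the parameterization, mirroring what no_intercept = TRUE constructs internally (the index j and total coefficient count P below are hypothetical; the exact dimensions expected should be checked against your fitted model):

P <- 20; j <- 1                     # hypothetical: force the j-th of P coefficients to 0
C <- matrix(0, P, 1); C[j, 1] <- 1  # constraint_vectors
cvals <- matrix(0, P, 1)            # constraint_values
# The enforced condition t(C) %*% (beta - cvals) == 0 then reduces to beta[j] == 0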

return_G

Default: TRUE. Logical switch to return the unscaled variance-covariance matrix without smoothing constraints (\textbf{G}).

return_Ghalf

Default: TRUE. Logical switch to return the matrix square root of the unscaled variance-covariance matrix without smoothing constraints (\textbf{G}^{1/2}).

return_U

Default: TRUE. Logical switch to return the constraint projection matrix \textbf{U}.

estimate_dispersion

Default: TRUE. Logical flag to estimate the dispersion parameter after fitting.

unbias_dispersion

Default: NULL. Logical switch to multiply final dispersion estimates by N/(N-\text{trace}(\textbf{X}\textbf{U}\textbf{G}\textbf{X}^{T})), which in the case of Gaussian-distributed errors with identity link function provides unbiased estimates of variance. When NULL (by default), gets set to TRUE for Gaussian + identity link and FALSE otherwise.
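
A short arithmetic sketch of the multiplier described above (numbers are hypothetical; trace_XUGX is returned by the fitted model):

N <- 100; trace_XUGX <- 12.5; sigmasq_biased <- 1.8
sigmasq_biased * N / (N - trace_XUGX) # rescaled dispersion estimate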

return_varcovmat

Default: TRUE. Logical switch to return the variance-covariance matrix of the estimated coefficients. This is needed for performing Wald inference.

custom_penalty_mat

Default: NULL. Optional p \times p custom penalty matrix for individual partitions to replace the default ridge penalty applied to linear-and-intercept terms only. This can be interpreted as proportional to the prior correlation matrix of coefficients for non-spline effects, and will appear in the penalty matrix for all partitions. It is recommended to first run the function using expansions_only = TRUE so you have an idea of where the expansions appear in each partition, what "p" is, and you can carefully customize your penalty matrix after.
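
A sketch of that recommended workflow (data simulated only for illustration; expansion names and ordering come from the returned object):

t <- runif(200, -5, 5)
y <- sin(t) + rnorm(200)
setup <- lgspline(t, y, K = 2, expansions_only = TRUE)
setup$colnm_expansions   # names/ordering of the p expansions per partition
p <- ncol(setup$X[[1]])  # the "p" a custom penalty matrix must match
# custom_penalty_mat would then be a p x p matrix, e.g. diag(p) for a plain ridge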

cluster_args

Default: c(custom_centers = NA, nstart = 10). Named vector of arguments controlling clustering procedures. If the first argument is not NA, this will be treated as custom cluster centers and all other arguments ignored. Otherwise, default base R k-means clustering will be used with all other arguments supplied to kmeans (for example, by default, the "nstart" argument as provided). Custom centers must be a K \times q matrix with one column for each predictor in order of their appearance in input predictor/data, and one row for each center.

dummy_dividor

Default: 1.2345672152894e-22. Small numeric constant to prevent division by zero in computational routines.

dummy_adder

Default: 2.234567210529e-18. Small numeric constant to prevent division by zero in computational routines.

verbose

Default: FALSE. Logical flag to print general progress messages during model fitting (does not include messages printed during tuning).

verbose_tune

Default: FALSE. Logical flag to print detailed progress messages during penalty tuning specifically.

expansions_only

Default: FALSE. Logical switch to return only basis expansions without full model fitting. Useful for setting up custom constraints and penalties.

observation_weights

Default: NULL. Numeric vector of observation-specific weights for generalized least squares estimation.

do_not_cluster_on_these

Default: c(). Vector specifying predictor columns to exclude from clustering procedures, in addition to the non-spline effects by default.

neighbor_tolerance

Default: 1 + 1e-8. Numeric tolerance for determining neighboring partitions using k-means clustering. Greater values mean more partitions are likely to be considered neighbors. Intended for internal use only (modify at your own risk!).

null_constraint

Default: NULL. Alternative parameterization of constraint values.

critical_value

Default: qnorm(1-0.05/2). Numeric critical value used for constructing Wald confidence intervals of the form \text{estimate} \pm \text{critical\_value} \times (\text{standard error}).

data

Default: NULL. Optional data frame providing context for formula-based model specification.

weights

Default: NULL. Alternative name for observation weights, maintained for interface compatibility.

no_intercept

Default: FALSE. Logical flag to remove intercept, constraining it to 0. The function automatically constructs constraint_vectors and constraint_values to achieve this. Calling formulas with a "0+" in it like y ~ 0 + . will set this option to TRUE.

correlation_id, spacetime

Default: NULL. N-length vector and N-row matrix of cluster (or subject, group etc.) ids and longitudinal/spatial variables respectively, whereby observations within each grouping of correlation_id are correlated with respect to the variables submitted to spacetime.

correlation_structure

Default: NULL. Native implementations of popular variance-covariance structures. Offers options for "exchangeable", "spatial-exponential", "squared-exponential", "ar(1)", "spherical", "gaussian-cosine", "gamma-cosine", and "matern", along with their aliases. The eponymous correlation structure is fit along with coefficients and dispersion, with correlation estimated using a REML objective. See section "Correlation Structure Estimation" for more details.

VhalfInv

Default: NULL. Matrix representing a fixed, custom square-root-inverse covariance structure for the response variable of longitudinal and spatial modeling. Must be an N \times N matrix where N is number of observations. This matrix \textbf{V}^{-1/2} serves as a fixed transformation matrix for the response, equivalent to GLS with known covariance \textbf{V}. This is known as "whitening" in some literature.

Vhalf

Default: NULL. Matrix representing a fixed, custom square-root covariance structure for the response variable of longitudinal and spatial modeling. Must be an N \times N matrix where N is number of observations. This matrix \textbf{V}^{1/2} is used when backtransforming coefficients for fitting GLMs with arbitrary correlation structure.

VhalfInv_fxn

Default: NULL. Function for parametric modeling of the covariance structure \textbf{V}^{-1/2}. Must take a single numeric vector argument "par" and return an N \times N matrix. When provided with VhalfInv_par_init, this function is optimized via BFGS to find optimal covariance parameters that minimize the negative REML log-likelihood (or custom loss if custom_VhalfInv_loss is specified). The function must return a valid square root of the inverse covariance matrix - i.e., if \textbf{V} is the true covariance, VhalfInv_fxn should return \textbf{V}^{-1/2} such that VhalfInv_fxn(par) %*% VhalfInv_fxn(par) = \textbf{V}^{-1}.
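
For illustration, a sketch of the required signature for a balanced AR(1)-by-block structure (n_blocks, block_size, and the tanh parameterization are assumptions of this sketch; matinvsqrt is the matrix inverse-square-root helper also used in the Examples):

n_blocks <- 50; block_size <- 4 # hypothetical balanced design, N = 200
VhalfInv_fxn <- function(par) {
  rho <- tanh(par[1])                                      # bound the correlation in (-1, 1)
  corr <- rho^abs(outer(1:block_size, 1:block_size, '-'))  # AR(1) block correlation
  kronecker(diag(n_blocks), matinvsqrt(corr))              # N x N block-diagonal V^{-1/2}
}
# Supplied alongside, e.g., VhalfInv_par_init = c(0) to trigger optimization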

Vhalf_fxn

Default: NULL. Function for efficient computation of \textbf{V}^{1/2}, used only when optimizing correlation structures with non-canonical-Gaussian response.

VhalfInv_par_init

Default: c(). Numeric vector of initial parameter values for VhalfInv_fxn optimization. When provided with VhalfInv_fxn, triggers optimization of the covariance structure. Length determines the dimension of the parameter space. For example, for AR(1) correlation, this could be a single correlation parameter; for unstructured correlation, this could be all unique elements of the correlation matrix.

REML_grad

Default: NULL. Function for evaluating the gradient of the objective function (negative REML or custom loss) with respect to the parameters of VhalfInv_fxn. Must take the same "par" argument as VhalfInv_fxn, as well as second argument "model_fit" for the output of lgspline.fit and ellipses "..." as a third argument. It should return a vector of partial derivatives matching the length of par. When provided, enables more efficient optimization via analytical gradients rather than numerical approximation. Optional - if NULL, BFGS uses numerical gradients.

custom_VhalfInv_loss

Default: NULL. Alternative to negative REML for serving as the objective function for optimizing correlation parameters. Must take the same "par" argument as VhalfInv_fxn, as well as second argument "model_fit" for the output of lgspline.fit and ellipses "..." as a third argument. It should return a numeric scalar.

VhalfInv_logdet

Default: NULL. Function for efficient computation of \log|\textbf{V}^{-1/2}| that bypasses construction of the full \textbf{V}^{-1/2} matrix. Must take the same parameter vector 'par' as VhalfInv_fxn and return a scalar value equal to \log(\det(\textbf{V}^{-1/2})). When NULL, the determinant is computed directly from VhalfInv, which can be computationally expensive for large matrices.

include_warnings

Default: TRUE. Logical switch to control display of warning messages during model fitting.

...

Additional arguments passed to the unconstrained model fitting function.

Details

A flexible and interpretable implementation of smoothing splines, supporting the GLM families, constraints, correlation structures, and customization options described in the argument list above. See the Examples for worked demonstrations.

Value

A list containing the following components:

y

Original response vector.

ytilde

Fitted/predicted values on the scale of the response.

X

List of design matrices \textbf{X}_k for each partition k, containing basis expansions including intercept, linear, quadratic, cubic, and interaction terms as specified.

A

Constraint matrix \textbf{A} encoding smoothness constraints at knot points and any user-specified linear constraints.

B

List of fitted coefficients \boldsymbol{\beta}_k for each partition k on the original, unstandardized scale of the predictors and response.

B_raw

List of fitted coefficients for each partition on the predictor-and-response standardized scale.

K

Number of interior knots with one predictor (number of partitions minus 1 with > 1 predictor).

p

Number of basis expansions of predictors per partition.

q

Number of predictor variables.

P

Total number of coefficients (p \times (K+1)).

N

Number of observations.

penalties

List containing optimized penalty matrices and components:

  • Lambda: Combined penalty matrix (\boldsymbol{\Lambda}), includes \textbf{L}_{\text{predictor\_list}} contributions but not \textbf{L}_{\text{partition\_list}}.

  • L1: Smoothing spline penalty matrix (\textbf{L}_1).

  • L2: Ridge penalty matrix (\textbf{L}_2).

  • L predictor list: Predictor-specific penalty matrices (\textbf{L}_{\text{predictor\_list}}).

  • L partition list: Partition-specific penalty matrices (\textbf{L}_{\text{partition\_list}}).

knot_scale_transf

Function for transforming predictors to standardized scale used for knot placement.

knot_scale_inv_transf

Function for transforming standardized predictors back to original scale.

knots

Matrix of knot locations on the original, unstandardized predictor scale (for the single-predictor case).

partition_codes

Vector assigning observations to partitions.

partition_bounds

Vector or matrix specifying the boundaries between partitions.

knot_expand_function

Internal function for expanding data according to partition structure.

predict

Function for generating predictions on new data.

assign_partition

Function for assigning new observations to partitions.

family

GLM family object specifying the error distribution and link function.

estimate_dispersion

Logical indicating whether dispersion parameter was estimated.

unbias_dispersion

Logical indicating whether dispersion estimates should be unbiased.

backtransform_coefficients

Function for converting standardized coefficients to original scale.

forwtransform_coefficients

Function for converting coefficients to standardized scale.

mean_y, sd_y

Mean and standard deviation of response if standardized.

og_order

Original ordering of observations before partitioning.

order_list

List containing observation indices for each partition.

constraint_values, constraint_vectors

Matrices specifying linear equality constraints if provided.

make_partition_list

List containing partition information for > 1-D cases.

expansion_scales

Vector of scaling factors used for standardizing basis expansions.

take_derivative, take_interaction_2ndderivative

Functions for computing derivatives of basis expansions.

get_all_derivatives_insample

Function for computing all derivatives on training data.

numerics

Indices of numeric predictors used in basis expansions.

power1_cols, power2_cols, power3_cols, power4_cols

Column indices for linear through quartic terms.

quad_cols

Column indices for all quadratic terms (including interactions).

interaction_single_cols, interaction_quad_cols

Column indices for linear-linear and linear-quadratic interactions.

triplet_cols

Column indices for three-way interactions.

nonspline_cols

Column indices for terms excluded from spline expansion.

return_varcovmat

Logical indicating whether variance-covariance matrix was computed.

raw_expansion_names

Names of basis expansion terms.

std_X, unstd_X

Functions for standardizing/unstandardizing design matrices.

parallel_cluster_supplied

Logical indicating whether a parallel cluster was supplied.

weights

List of observation weights per partition.

G

List of unscaled unconstrained variance-covariance matrices \textbf{G}_k per partition k if return_G=TRUE. Computed as (\textbf{X}_k^T\textbf{X}_k + \boldsymbol{\Lambda}_\text{eff})^{-1} for partition k.

Ghalf

List of \textbf{G}_k^{1/2} matrices if return_Ghalf=TRUE.

U

Constraint projection matrix \textbf{U} if return_U=TRUE. For K=0 and no constraints, returns identity. Otherwise, returns \textbf{U} = \textbf{I} - \textbf{G}\textbf{A}(\textbf{A}^T\textbf{G}\textbf{A})^{-1}\textbf{A}^T. Used for computing the variance-covariance matrix \sigma^2 \textbf{U}\textbf{G}.

sigmasq_tilde

Estimated (or fixed) dispersion parameter \tilde{\sigma}^2.

trace_XUGX

Effective degrees of freedom (\text{trace}(\textbf{X}\textbf{U}\textbf{G}\textbf{X}^{T})).

varcovmat

Variance-covariance matrix of coefficient estimates if return_varcovmat=TRUE.

VhalfInv

The \textbf{V}^{-1/2} matrix used for implementing correlation structures, if specified.

VhalfInv_fxn, Vhalf_fxn, VhalfInv_logdet, REML_grad

Functions for generating \textbf{V}^{-1/2}, \textbf{V}^{1/2}, \log|\textbf{V}^{-1/2}|, and gradient of REML if provided.

VhalfInv_params_estimates

Vector of estimated correlation parameters when using VhalfInv_fxn.

VhalfInv_params_vcov

Approximate variance-covariance matrix of estimated correlation parameters from BFGS optimization.

wald_univariate

Function for computing univariate Wald statistics and confidence intervals.

critical_value

Critical value used for confidence interval construction.

generate_posterior

Function for drawing from the posterior distribution of coefficients.

find_extremum

Function for optimizing the fitted function.

plot

Function for visualizing fitted curves.

quadprog_list

List containing quadratic programming components if applicable.

When expansions_only=TRUE is used, a reduced list is returned containing only the following prior to any fitting or tuning:

X

Design matrices \textbf{X}_k

y

Response vectors \textbf{y}_k

A

Constraint matrix \textbf{A}

penalties

Penalty matrices

order_list, og_order

Ordering information

expansion_scales, colnm_expansions

Scaling and naming information

K, knots

Knot information

make_partition_list, partition_codes, partition_bounds

Partition information

constraint_vectors, constraint_values

Constraint information

quadprog_list

Quadratic programming components if applicable

The returned object has class "lgspline" and provides comprehensive tools for model interpretation, inference, prediction, and visualization. All coefficients and predictions can be transformed between standardized and original scales using the provided transformation functions. The object includes both frequentist and Bayesian inference capabilities through Wald statistics and posterior sampling. Advanced customization options are available for analyzing arbitrarily complex study designs. See Details for descriptions of the model fitting process.

See Also

Examples


## ## ## ## Simple Examples ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##
## Simulate some data, fit using default settings, and plot
set.seed(1234)
t <- runif(2500, -10, 10)
y <- 2*sin(t) + -0.06*t^2 + rnorm(length(t))
model_fit <- lgspline(t, y)
plot(t, y, main = 'Observed Data vs. Fitted Function, Colored by Partition')
plot(model_fit, add = TRUE)

## Repeat using logistic regression, with univariate inference shown
# and alternative function call
y <- rbinom(length(y), 1, 1/(1+exp(-std(y))))
df <- data.frame(t = t, y = y)
model_fit <- lgspline(y ~ spl(t),
                      df,
                      opt = FALSE, # no tuning penalties
                      family = quasibinomial())
plot(t, y, main = 'Observed Data vs Fitted Function with Formulas and Derivatives',
  ylim = c(-0.5, 1.05), cex.main = 0.8)
plot(model_fit,
     show_formulas = TRUE,
     text_size_formula = 0.65,
     legend_pos = 'bottomleft',
     legend_args = list(y.intersp = 1.1),
     add = TRUE)
## Notice how the coefficients match the formula, and expansions are
# homogeneous across partitions without reparameterization
print(summary(model_fit))

## Overlay first and second derivatives of fitted function respectively
derivs <- predict(model_fit,
                  new_predictors = sort(t),
                  take_first_derivatives = TRUE,
                  take_second_derivatives = TRUE)
points(sort(t), derivs$first_deriv, col = 'gold', type = 'l')
points(sort(t), derivs$second_deriv, col = 'goldenrod', type = 'l')
legend('bottomright',
       col = c('gold','goldenrod'),
       lty = 1,
       legend = c('First Derivative', 'Second Derivative'))

## Simple 2D example - including a non-spline effect
z <- seq(-2, 2, length.out = length(y))
df <- data.frame(Predictor1 = t,
                 Predictor2 = z,
                 Response = sin(y)+0.1*z)
model_fit <- lgspline(Response ~ spl(Predictor1) + Predictor1*Predictor2,
                      df)

## Notice, while spline effects change over partitions,
# interactions and non-spline effects are constrained to remain the same
coefficients <- Reduce('cbind', coef(model_fit))
colnames(coefficients) <- paste0('Partition ', 1:(model_fit$K+1))
print(coefficients)

## One or two variables can be selected for plotting at a time
# even when >= 3 predictors are present
plot(model_fit,
      custom_title = 'Marginal Relationship of Predictor 1 and Response',
      vars = 'Predictor1',
      custom_response_lab = 'Response',
      show_formulas = TRUE,
      legend_pos = 'bottomright',
      digits = 4,
      text_size_formula = 0.5)

## 3D plots are implemented as well, retaining analytical formulas
my_plot <- plot(model_fit,
                show_formulas = TRUE,
                custom_response_lab = 'Response')
my_plot


## ## ## ## More Detailed 1D Example ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##
## 1D data generating functions
t <- seq(-9, 9, length.out = 1000)
slinky <- function(x) {
  (50 * cos(x * 2) - 2 * x^2 + (0.25 * x)^4 + 80)
}
coil <- function(x) {
  (100 * cos(x * 2) - 1.5 * x^2 + (0.1 * x)^4 +
   (0.05 * x^3) + (-0.01 * x^5) +
   (0.00002 * x^6) - (0.000001 * x^7) + 100)
}
exponential_log <- function(x) {
  unlist(c(sapply(x, function(xx) {
    if (xx <= 1) {
      100 * (exp(xx) - exp(1))
    } else {
      100 * (log(xx))
    }
  })))
}
scaled_abs_gamma <- function(x) {
  2*sqrt(gamma(abs(x)))
}

## Composite function
fxn <- function(x) (slinky(x) +
                    coil(x) +
                    exponential_log(x) +
                    scaled_abs_gamma(x))

## Bind together with random noise
dat <- cbind(t, fxn(t) + rnorm(length(t), 0, 50))
colnames(dat) <- c('t', 'y')
x <- dat[,'t']
y <- dat[,'y']

## Fit Model, 4 equivalent ways are shown below
model_fit <- lgspline(t, y)
model_fit <- lgspline(y ~ spl(t), as.data.frame(dat))
model_fit <- lgspline(response = y, predictors = t)
model_fit <- lgspline(data = as.data.frame(dat), formula = y ~ .)

# This is not valid: lgspline(y ~ ., t)
# This is not valid: lgspline(y, data = as.data.frame(dat))
# Do not put operations in formulas, not valid: lgspline(y ~ log(t) + spl(t))

## Basic Functionality
predict(model_fit, new_predictors = rnorm(1)) # make prediction on new data
head(leave_one_out(model_fit)) # leave-one-out cross-validated predictions
coef(model_fit) # extract coefficients
summary(model_fit) # model information and Wald inference
generate_posterior(model_fit) # generate draws of parameters from posterior distribution
find_extremum(model_fit, minimize = TRUE) # find the minimum of the fitted function

## Incorporate range constraints, custom knots, keep penalization identical
# across partitions
model_fit <- lgspline(y ~ spl(t),
                      unique_penalty_per_partition = FALSE,
                      custom_knots = cbind(c(-2, -1, 0, 1, 2)),
                      data = data.frame(t = t, y = y),
                      qp_range_lower = -150,
                      qp_range_upper = 150)

## Plotting the constraints and knots
plot(model_fit,
     custom_title = 'Fitted Function Constrained to Lie Between (-150, 150)',
     cex.main = 0.75)
# knot locations
abline(v = model_fit$knots)
# lower bound from quadratic program
abline(h = -150, lty = 2)
# upper bound from quadratic program
abline(h = 150, lty = 2)
# observed data
points(t, y, cex = 0.24)

## Enforce monotonic increasing constraints on fitted values
# K = 4 => 5 partitions
t <- seq(-10, 10, length.out = 100)
y <- 5*sin(t) + t + 2*rnorm(length(t))
model_fit <- lgspline(t,
                      y,
                      K = 4,
                      qp_monotonic_increase = TRUE)
plot(t, y, main = 'Monotonic Increasing Function with Respect to Fitted Values')
plot(model_fit,
     add = TRUE,
     show_formulas = TRUE,
     legend_pos = 'bottomright',
     custom_predictor_lab = 't',
     custom_response_lab = 'y')

## ## ## ## 2D Example using Volcano Dataset ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##
## Prep
data('volcano')
volcano_long <-
  Reduce('rbind', lapply(1:nrow(volcano), function(i){
    t(sapply(1:ncol(volcano), function(j){
      c(i, j, volcano[i,j])
    }))
  }))
colnames(volcano_long) <- c('Length', 'Width', 'Height')

## Fit, with 50 partitions
# When fitting with > 1 predictor and large K, including quartic terms
# is highly recommended, and/or dropping the second-derivative constraint.
# Otherwise, the constraints can force all partitions to be equal, with one
# cubic function fit for all (there aren't enough degrees of freedom to fit
# unique cubic functions due to the massive number of constraints).
# Below, quartic terms are included and the constraint of second-derivative
# smoothness at knots is ignored.
model_fit <- lgspline(volcano_long[,c(1, 2)],
                      volcano_long[,3],
                      include_quadratic_interactions = TRUE,
                      K = 49,
                      opt = FALSE,
                      return_U = FALSE,
                      return_varcovmat = FALSE,
                      estimate_dispersion = TRUE,
                      return_Ghalf = FALSE,
                      return_G = FALSE,
                      include_constrain_second_deriv = FALSE,
                      unique_penalty_per_predictor = FALSE,
                      unique_penalty_per_partition = FALSE,
                      wiggle_penalty = 1e-5, # the fixed wiggle penalty
                      flat_ridge_penalty = 1e-4) # the ridge penalty

## Plotting on new data with interactive visual + formulas
new_input <- expand.grid(seq(min(volcano_long[,1]),
                             max(volcano_long[,1]),
                             length.out = 250),
                         seq(min(volcano_long[,2]),
                             max(volcano_long[,2]),
                             length.out = 250))
model_fit$plot(new_predictors = new_input,
               show_formulas = TRUE,
               custom_response_lab = "Height",
               custom_title = 'Volcano 3-D Map',
               digits = 2)

## ## ## ## Advanced Techniques using Trees Dataset ## ## ## ## ## ## ## ## ## ## ## ## ##
## Goal here is to introduce how lgspline works with non-canonical GLMs and
# demonstrate some custom features
data('trees')

## L1-regularization constraint function on standardized coefficients
# Bound all coefficients to be less than a certain value (l1_bound) in absolute
# magnitude such that | B^{(j)}_k | < l1_bound for all j = 1...p coefficients,
# and k = 1...K+1 partitions.
l1_constraint_matrix <- function(p, K) {
  ## Total number of coefficients
  P <- p * (K + 1)

  ## Create diagonal matrices for L1 constraint
  # First matrix:  beta > -bound
  # Second matrix: -beta > -bound (i.e., beta < bound)
  first_diag <- diag(P)
  second_diag <- -diag(P)

  ## Combine matrices
  l1_Amat <- cbind(first_diag, second_diag)

  return(l1_Amat)
}

## Bounds absolute value of coefficients to be < l1_bound
l1_bound_vector <- function(qp_Amat,
                            scales,
                            l1_bound) {

  ## Combine matrices
  l1_bvec <- rep(-l1_bound, ncol(qp_Amat)) * c(1, scales)

  return(l1_bvec)
}

## Fit model, using predictor-response formulation, assuming
# Gamma-distributed response, and custom quadratic-programming constraints,
# with qp_score_function/glm_weight_function updated for non-canonical GLMs
# as well as quartic terms, keeping the effect of height constant across
# partitions, and 3 partitions in total. Hence, this is an advanced-usage
# case.
# You can modify this code for performing l1-regularization in general.
# For canonical GLMs, the default qp_score_function/glm_weight_function are
# correct and do not need to be changed
# (custom functionality is not needed for canonical GLMs).
model_fit <- lgspline(
  Volume ~ spl(Girth) + Height*Girth,
  data = with(trees, cbind(Girth, Height, Volume)),
  family = Gamma(link = 'log'),
  keep_weighted_Lambda = TRUE,
  glm_weight_function = function(
    mu,
    y,
    order_indices,
    family,
    dispersion,
    observation_weights,
   ...){
     rep(1/dispersion, length(y))
   },
   dispersion_function = function(
     mu,
     y,
     order_indices,
     family,
     observation_weights,
   ...){
    mean(
      ((y - mu)^2)/mu^2
    )
  }, # = biased estimate of 1/shape parameter
  need_dispersion_for_estimation = TRUE,
  unbias_dispersion = TRUE, # multiply dispersion by N/(N-trace(XUGX^{T}))
  K = 2, # 3 partitions
  opt = FALSE, # keep penalties fixed
  unique_penalty_per_partition = FALSE,
  unique_penalty_per_predictor = FALSE,
  flat_ridge_penalty = 1e-64,
  wiggle_penalty = 1e-64,
  qp_score_function = function(X, y, mu, order_list, dispersion, VhalfInv,
    observation_weights, ...){
   t(X) %**% diag(c(1/mu * 1/dispersion)) %**% cbind(y - mu)
  }, # updated score for gamma regression with log link
  qp_Amat_fxn = function(N, p, K, X, colnm, scales, deriv_fxn, ...) {
    l1_constraint_matrix(p, K)
  },
  qp_bvec_fxn = function(qp_Amat, N, p, K, X, colnm, scales, deriv_fxn, ...) {
    l1_bound_vector(qp_Amat, scales, 25)
  },
  qp_meq_fxn = function(qp_Amat, N, p, K, X, colnm, scales, deriv_fxn, ...) 0
)

## Notice, interaction effect is constant across partitions as is the effect
# of Height alone
Reduce('cbind', coef(model_fit))
print(summary(model_fit))

## Plot results
plot(model_fit, custom_predictor_lab1 = 'Girth',
     custom_predictor_lab2 = 'Height',
     custom_response_lab = 'Volume',
     custom_title = 'Girth and Height Predicting Volume of Trees',
     show_formulas = TRUE)

## Verify magnitude of unstandardized coefficients does not exceed bound (25)
print(max(abs(unlist(model_fit$B))))

## Find height and girth where tree volume is closest to 42
# Uses custom objective that minimizes MSE discrepancy between predicted
# value and 42.
# The vanilla find_extremum function can be thought of as
# using "function(mu)mu" aka the identity function as the
# objective, where mu = "f(t)", our estimated function. The derivative is then
# d_mu = "f'(t)" with respect to predictors t.
# But with more creative objectives, and since we have machinery for
# f'(t) already available, we can compute gradients for (and optimize)
# arbitrary differentiable functions of our predictors too.
# For any objective, differentiate w.r.t. mu, then multiply by d_mu to
# satisfy chain rule.
# Here, we have objective function: 0.5*(mu-42)^2
# and gradient                    : (mu-42)*d_mu
# and L-BFGS-B will be used to find the height and girth that most closely
# yields a prediction of 42 within the bounds of the observed data.
# The d_mu also takes into account link function transforms automatically
# for most common link functions, and will return warning + instructions
# on how to program the link-function derivatives otherwise.

## Custom acquisition functions for Bayesian optimization could be coded here.
find_extremum(
  model_fit,
  minimize = TRUE,
  custom_objective_function = function(mu, sigma, ybest, ...){
    0.5*(mu - 42)^2
  },
  custom_objective_derivative = function(mu, sigma, ybest, d_mu, ...){
    (mu - 42) * d_mu
  }
)

## ## ## ## How to Use Formulas in lgspline ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##
## Demonstrates splines with multiple mixed predictors and interactions

## Generate data
n <- 2500
x <- rnorm(n)
y <- rnorm(n)
z <- sin(x)*mean(abs(y))/2

## Categorical predictors
cat1 <- rbinom(n, 1, 0.5)
cat2 <- rbinom(n, 1, 0.5)
cat3 <- rbinom(n, 1, 0.5)

## Response with mix of effects
response <- y + z + 0.1*(2*cat1 - 1)

## Continuous predictors re-named
continuous1 <- x
continuous2 <- z

## Combine data
dat <- data.frame(
  response = response,
  continuous1 = continuous1,
  continuous2 = continuous2,
  cat1 = cat1,
  cat2 = cat2,
  cat3 = cat3
)

## Example 1: Basic Model with Default Terms, No Intercept
# standardize_response = FALSE often needed when constraining intercepts to 0
fit1 <- lgspline(
  formula = response ~ 0 + spl(continuous1, continuous2) +
    cat1*cat2*continuous1 + cat3,
  K = 2,
  standardize_response = FALSE,
  data = dat
)
## Examine coefficients included
rownames(fit1$B$partition1)
## Verify intercept term is near 0 up to some numeric tolerance
abs(fit1$B[[1]][1]) < 1e-8

## Example 2: Similar Model with Intercept, Other Terms Excluded
fit2 <- lgspline(
  formula = response ~ spl(continuous1, continuous2) +
    cat1*cat2*continuous1 + cat3,
  K = 1,
  standardize_response = FALSE,
  include_cubic_terms = FALSE,
  exclude_these_expansions = c( # Not all need to actually be present
    '_batman_x_robin_',
    '_3_x_4_', # no cat1 x cat2 interaction, coded using column indices
    'continuous1xcontinuous2', # no continuous1 x continuous2 interaction
    'thejoker'
  ),
  data = dat
)
## Examine coefficients included
rownames(Reduce('cbind',coef(fit2)))
# Intercept will probably be present and non-0 now
abs(fit2$B[[1]][1]) < 1e-8

## ## ## ## Compare Inference to survreg for Weibull AFT Model Validation ##
# Only linear predictors, no knots, no penalties, using Weibull AFT Model
# The goal here is to ensure that for the special case of no spline effects
# and no knots, this implementation will be consistent with other model
# implementations.
# Also note that when using models (like Weibull AFT) where dispersion is
# being estimated and is required for estimating beta coefficients,
# we use a Schur complement correction function to adjust (or "correct") our
# variance-covariance matrix for both estimation and inference to account for
# uncertainty in estimating the dispersion.
# Typically the shur_correction_function would return a negative-definite
# matrix, as its output is elementwise added to the information matrix prior
# to inversion.
require(survival)
df <- data.frame(na.omit(
  pbc[,c('time','trt','stage','hepato','bili','age','status')]
))

## Weibull AFT using lgspline, showing how some custom options can be used to
# fit more complicated models
model_fit <- lgspline(time ~ trt + stage + hepato + bili + age,
                      df,
                      family = weibull_family(),
                      need_dispersion_for_estimation = TRUE,
                      dispersion_function = weibull_dispersion_function,
                      glm_weight_function = weibull_glm_weight_function,
                      shur_correction_function = weibull_shur_correction,
                      unconstrained_fit_fxn = unconstrained_fit_weibull,
                      opt = FALSE,
                      wiggle_penalty = 0,
                      flat_ridge_penalty = 0,
                      K = 0,
                      status = pbc$status!=0)
print(summary(model_fit))

## Survreg results match closely on estimates and inference for coefficients
survreg_fit <- survreg(Surv(time, status!=0) ~ trt + stage + hepato + bili + age,
                       df)
print(summary(survreg_fit))

## sigmasq_tilde = scale^2 of survreg
print(c(sqrt(model_fit$sigmasq_tilde), survreg_fit$scale))

## ## ## ## Modelling Correlation Structures ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##
## Setup
n_blocks <- 150 # Number of correlation_ids (subjects)
block_size <- 5 # Size of each correlation_id block (number of repeated measures per subj.)
N <- n_blocks * block_size # total sample size (balanced here)
rho_true <- 0.25  # True correlation

## Generate predictors and mean structure
t <- seq(-9, 9, length.out = N)
true_mean <- sin(t)

## Create block compound symmetric errors = (1-rho)*I + rho*J
errors <- Reduce('rbind',
                 lapply(1:n_blocks,
                        function(i){
                          sigma <- diag(block_size) + rho_true *
                            (matrix(1, block_size, block_size) -
                               diag(block_size))
                          matsqrt(sigma) %*% rnorm(block_size)
                        }))

## Generate response with correlated errors
y <- true_mean + errors * 0.5

## Fit model with correlation structure
# include_warnings = FALSE is a good idea here, since many proposed
# correlations won't work
model_fit <- lgspline(t,
                      y,
                      K = 4,
                      correlation_id = rep(1:n_blocks, each = block_size),
                      correlation_structure = 'exchangeable',
                      include_warnings = FALSE
)

## Assess overall fit
plot(t, y, main = 'Sinusoidal Fit Under Correlation Structure')
plot(model_fit, add = TRUE, show_formulas = TRUE, custom_predictor_lab = 't')

## Compare estimated vs true correlation
rho_est <- tanh(model_fit$VhalfInv_params_estimates)
print(c("True correlation:" = rho_true,
        "Estimated correlation:" = rho_est))

## Quantify uncertainty in correlation estimate with 95% confidence interval
se <- c(sqrt(diag(model_fit$VhalfInv_params_vcov))) / sqrt(model_fit$N)
ci <- tanh(model_fit$VhalfInv_params_estimates + c(-1.96, 1.96)*se)
print("95% CI for correlation:")
print(ci)

## Also check SD (should be close to 0.5)
print(sqrt(model_fit$sigmasq_tilde))

## Toeplitz Simulation Setup, with demonstration of custom functions
# and boilerplate. A Toeplitz structure is not implemented by default because
# it makes strong assumptions about the study design and missingness that are
# rarely met, with non-obvious workarounds.
# If a GLM were to be fit, you would also supply a function "Vhalf_fxn",
# analogous to VhalfInv_fxn, taking the same argument (par) and returning an
# N x N matrix that is the inverse of the VhalfInv_fxn output (a sketch of
# such a function is given after the VhalfInv_fxn definition below).
n_blocks <- 150   # Number of correlation_ids
block_size <- 4   # Observations per correlation_id
N <- n_blocks * block_size # total sample size
rho_true <- 0.5    # True correlation within correlation_ids
true_intercept <- 2     # True intercept
true_slope <- 0.5       # True slope for covariate

## Create design matrix with meaningful predictors
Tmat <- matrix(0, N, 2)
Tmat[,1] <- 1  # Intercept
Tmat[,2] <- cos(rnorm(N))  # Continuous predictor

## True coefficients
beta <- c(true_intercept, true_slope)

## Create time and correlation_id variables
time_var <- rep(1:block_size, n_blocks)
correlation_id_var <- rep(1:n_blocks, each = block_size)

## Create block compound symmetric errors
errors <- Reduce('rbind',
                 lapply(1:n_blocks, function(i) {
                   sigma <- diag(block_size) + rho_true *
                     (matrix(1, block_size, block_size) -
                        diag(block_size))
                   matsqrt(sigma) %*% rnorm(block_size)
                 }))

## Generate response with correlated errors and covariate effect
y <- Tmat %*% beta + errors * 2

## Toeplitz correlation function
VhalfInv_fxn <- function(par) {
  # Initialize correlation matrix
  corr <- matrix(0, block_size, block_size)

  # Construct Toeplitz matrix with correlation by lag
  for(i in 1:block_size) {
    for(j in 1:block_size) {
      lag <- abs(time_var[i] - time_var[j])
      if(lag == 0) {
        corr[i,j] <- 1
      } else if(lag <= length(par)) {
        # Use tanh to bound correlations between -1 and 1
        corr[i,j] <- tanh(par[lag])
      }
    }
  }

  ## Matrix square root inverse
  corr_inv_sqrt <- matinvsqrt(corr)

  ## Expand to full matrix using Kronecker product
  kronecker(diag(n_blocks), corr_inv_sqrt)
}
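
## Sketch only (not needed for this Gaussian fit): the "Vhalf_fxn" counterpart
# mentioned above, as would be supplied for a GLM. It builds the same Toeplitz
# block but returns its matrix square root, so Vhalf_fxn(par) is the inverse
# of VhalfInv_fxn(par). It reuses matsqrt(), as in the error simulation above.
Vhalf_fxn <- function(par) {
  # Same Toeplitz block construction as VhalfInv_fxn
  corr <- matrix(0, block_size, block_size)
  for(i in 1:block_size) {
    for(j in 1:block_size) {
      lag <- abs(time_var[i] - time_var[j])
      if(lag == 0) {
        corr[i,j] <- 1
      } else if(lag <= length(par)) {
        corr[i,j] <- tanh(par[lag])
      }
    }
  }
  # Matrix square root (not inverse square root), expanded to N x N
  kronecker(diag(n_blocks), matsqrt(corr))
}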

## Log-determinant function (for efficiency)
# This avoids taking the determinant of the full N x N matrix
VhalfInv_logdet <- function(par) {
  # Initialize correlation matrix
  corr <- matrix(0, block_size, block_size)

  # Construct Toeplitz matrix
  for(i in 1:block_size) {
    for(j in 1:block_size) {
      lag <- abs(time_var[i] - time_var[j])
      if(lag == 0) {
        corr[i,j] <- 1
      } else if(lag <= length(par)) {
        corr[i,j] <- tanh(par[lag])
      }
    }
  }

  # Compute log determinant
  log_det_invsqrt_corr <- -0.5 * determinant(corr, logarithm=TRUE)$modulus[1]
  return(n_blocks * log_det_invsqrt_corr)
}
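
## Optional quick check of VhalfInv_logdet against a direct computation on the
# full N x N matrix (feasible here since N = 600); par_check is a hypothetical
# parameter vector used only for this comparison.
par_check <- c(0.2, 0.1, 0.05)
direct <- determinant(VhalfInv_fxn(par_check), logarithm = TRUE)$modulus[1]
print(c(direct = direct, blockwise = VhalfInv_logdet(par_check)))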

## REML gradient function
REML_grad <- function(par, model_fit, ...) {
  ## Initialize gradient vector
  n_par <- length(par)
  gradient <- numeric(n_par)

  ## Get dimensions and organize data
  nr <- nrow(model_fit$X[[1]])

  ## Process derivatives one parameter at a time
  for(p in 1:n_par) {
    ## Initialize derivative matrix
    dV <- matrix(0, nrow(model_fit$VhalfInv), ncol(model_fit$VhalfInv))
    V <- matrix(0, nrow(model_fit$VhalfInv), ncol(model_fit$VhalfInv))

    ## Compute full correlation matrix and its derivative for parameter p
    for(clust in unique(correlation_id_var)) {
      inds <- which(correlation_id_var == clust)
      block_size <- length(inds)

      ## Initialize block matrices
      V_block <- matrix(0, block_size, block_size)
      dV_block <- matrix(0, block_size, block_size)

      ## Construct Toeplitz matrix and its derivative
      for(i in 1:block_size) {
        for(j in 1:block_size) {
          ## Compute lag between observations
          lag <- abs(time_var[i] - time_var[j])

          ## Diagonal is always 1
          if(i == j) {
            V_block[i,j] <- 1
            dV_block[i,j] <- 0
          } else {
            ## Correlation for off-diagonal depends on lag
            if(lag <= length(par)) {
              ## Correlation via tanh parameterization
              V_block[i,j] <- tanh(par[lag])

              ## Derivative for the relevant parameter
              if(lag == p) {
                ## Chain rule for tanh: d/dx tanh(x) = 1 - tanh^2(x)
                dV_block[i,j] <- 1 - tanh(par[p])^2
              }
            }
          }
        }
      }

      ## Assign blocks to full matrices
      V[inds, inds] <- V_block
      dV[inds, inds] <- dV_block
    }

    ## GLM Weights based on current model fit (all 1s for normal)
    glm_weights <- rep(1, model_fit$N)

    ## Quadratic form contribution
    resid <- model_fit$y - model_fit$ytilde
    VinvResid <- model_fit$VhalfInv %**% cbind(resid) / glm_weights
    quad_term <- -0.5 * ((t(VinvResid) %**% dV) %**% VinvResid) /
      model_fit$sigmasq_tilde

    ## Log|V| contribution - trace term
    trace_term <- 0.5 * sum(diag(model_fit$VhalfInv %**%
                                   model_fit$VhalfInv %**%
                                   dV))

    ## Information matrix contribution
    U <- t(t(model_fit$U) * rep(c(1, model_fit$expansion_scales),
                                model_fit$K + 1)) /
      model_fit$sd_y
    VhalfInvX <- model_fit$VhalfInv %**%
      collapse_block_diagonal(model_fit$X)[unlist(
        model_fit$og_order
      ),] %**%
      U

    ## Lambda computation for GLMs
    if(length(model_fit$penalties$L_partition_list) != (model_fit$K + 1)){
      model_fit$penalties$L_partition_list <- lapply(
        1:(model_fit$K + 1), function(k)0
      )
    }
    Lambda <- U %**% collapse_block_diagonal(
      lapply(1:(model_fit$K + 1),
             function(k)
               c(1, model_fit$expansion_scales) * (
                 model_fit$penalties$L_partition_list[[k]] +
                   model_fit$penalties$Lambda) %**%
               diag(c(1, model_fit$expansion_scales)) /
               model_fit$sd_y^2
      )
    ) %**% t(U)

    XVinvX_inv <- invert(gramMatrix(t(t(VhalfInvX)*c(glm_weights))) +
                           Lambda)
    VInvX <- model_fit$VhalfInv %**% VhalfInvX
    sc <- sqrt(norm(VInvX, '2'))
    VInvX <- VInvX/sc
    dXVinvX <-
      (XVinvX_inv %**% t(VInvX)) %**%
      (dV %**% VInvX)
    XVinvX_term <- -0.5 * colSums(cbind(c(diag(dXVinvX) * sc))) * sc

    ## Store gradient component (all three terms)
    gradient[p] <- as.numeric(quad_term + trace_term + XVinvX_term)
  }

  ## Return normalized gradient
  return(gradient / model_fit$N)
}

## Visualization
plot(time_var, y, col = correlation_id_var,
     main = "Simulated Data with Toeplitz Correlation")

## Fit model with custom Toeplitz correlation (takes ~5-10 minutes on a laptop)
# Note that for GLMs, efficiency would be improved by supplying a Vhalf_fxn,
# although strictly only VhalfInv_fxn and VhalfInv_par_init are needed
model_fit <- lgspline(
  response = y,
  predictors = Tmat[,2],
  K = 4,
  VhalfInv_fxn = VhalfInv_fxn,
  VhalfInv_logdet = VhalfInv_logdet,
  REML_grad = REML_grad,
  VhalfInv_par_init = c(0, 0, 0),
  include_warnings = FALSE
)

## Print comparison of true and estimated correlations
rho_true <- rep(0.5, 3)
rho_est <- tanh(model_fit$VhalfInv_params_estimates)
cat("Correlation Estimates:\n")
print(data.frame(
  "True Correlation" = rho_true,
  "Estimated Correlation" = rho_est
))

## Also check SD (should be close to 2)
print(sqrt(model_fit$sigmasq_tilde))

## ## ## ## Parallelism ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##
## Generate data
a <- runif(500000, -9, 9)
b <- runif(500000, -9, 9)
c <- rnorm(500000)
d <- rpois(500000, 1)
y <- sin(a) + cos(b) - 0.2*sqrt(a^2 + b^2) +
  abs(a) + b +
  0.5*(a^2 + b^2) +
  (1/6)*(a^3 + b^3) +
  a*b*c -
  c +
  d +
  rnorm(500000, 0, 5)

## Set up cluster (one worker here; use more cores in practice)
# The cluster is stopped explicitly after fitting below
cl <- parallel::makeCluster(1)

## This example shows some of the operations that can be parallelized
# By default, only parallel_eigen and parallel_unconstrained are TRUE
# G, G^{-1/2}, and G^{1/2} are computed in parallel across the K+1 partitions.
# However, parallel_unconstrained only affects GLMs without correlation
# components, so it does not affect fitting here.
system.time({
  parfit <- lgspline(y ~ spl(a, b) + a*b*c + d,
                     data = data.frame(y = y,
                                       a = a,
                                       b = b,
                                       c = c,
                                       d = d),
                     cl = cl,
                     K = 1,
                     parallel_eigen = TRUE,
                     parallel_unconstrained = TRUE,
                     parallel_aga = FALSE,
                     parallel_find_neighbors = FALSE,
                     parallel_trace = FALSE,
                     parallel_matmult = FALSE,
                     parallel_make_constraint = FALSE,
                     parallel_penalty = FALSE)
})
parallel::stopCluster(cl)
print(summary(parfit))
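
## For a rough sense of parallel overhead vs. benefit, the same model can be
# refit without a cluster; this is a sketch for comparison only, and timings
# will depend on hardware and on which steps dominate for this K.
system.time({
  serialfit <- lgspline(y ~ spl(a, b) + a*b*c + d,
                        data = data.frame(y = y, a = a, b = b, c = c, d = d),
                        K = 1)
})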



[Package lgspline version 0.2.0 Index]