% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/snp_pruning.R
\name{snp.pruning}
\alias{snp.pruning}
\title{Reduces the number of redundant markers on a molecular matrix M by pruning}
\usage{
snp.pruning(
  M = NULL,
  map = NULL,
  marker = NULL,
  chrom = NULL,
  pos = NULL,
  method = c("correlation"),
  criteria = c("callrate", "maf"),
  pruning.thr = 0.95,
  by.chrom = FALSE,
  window.n = 50,
  overlap.n = 5,
  iterations = 10,
  seed = NULL,
  message = TRUE
)
}
\arguments{
\item{M}{A matrix with marker data of full form (\eqn{n \times p}), with \eqn{n} individuals
and \eqn{p} markers. Individual and marker names are assigned to \code{rownames} and \code{colnames}, respectively.
Data in matrix is coded as 0, 1, 2 (integer or numeric) (default = \code{NULL}).}

\item{map}{(Optional) A data frame with the map information with \eqn{p} rows.
If \code{NULL} a dummy map is generated considering a single chromosome and sequential positions
for markers. A \code{map} is mandatory if \code{by.chrom = TRUE}, in which case option \code{chrom}
must also be non-null.}

\item{marker}{A character indicating the name of the column in data frame \code{map}
with the identification
of markers. This is mandatory if \code{map} is provided (default = \code{NULL}).}

\item{chrom}{A character indicating the name of the column in data frame \code{map} with the identification
of chromosomes. This is mandatory if \code{map} is provided (default = \code{NULL}).}

\item{pos}{A character indicating the name of the column in data frame \code{map} with the identification
of marker positions (default = \code{NULL}).}

\item{method}{A character indicating the method (or algorithm) to be used as reference for
identifying redundant markers.
The only method currently available is based on correlations (default = \code{"correlation"}).}

\item{criteria}{A character indicating the criteria to choose which marker to drop
from a detected redundant pair.
Options are: \code{"callrate"} (the marker with fewer missing values will be kept) and
\code{"maf"} (the marker with higher minor allele frequency will be kept) (default = \code{"callrate"}).}

\item{pruning.thr}{A threshold value to identify redundant markers with Pearson's correlation larger than the
value provided (default = \code{0.95}).}

\item{by.chrom}{If \code{TRUE} the pruning is performed independently by chromosome (default = \code{FALSE}).}

\item{window.n}{A numeric value with number of markers to consider in each
window to perform pruning (default = \code{50}).}

\item{overlap.n}{A numeric value with number of markers to overlap between consecutive windows
(default = \code{5}).}

\item{iterations}{An integer indicating the number of sequential times the pruning procedure
should be executed on remaining markers.
If no markers are dropped in a given iteration/run, the algorithm will stop (default = \code{10}).}

\item{seed}{An integer to be used as seed for reproducibility. In case the selected criterion has the
same value for a given pair of markers, one will be dropped at random (default = \code{NULL}).}

\item{message}{If \code{TRUE} diagnostic messages are printed on screen (default = \code{TRUE}).}
}
\value{
\itemize{
 \item{\code{Mpruned}: a matrix containing the pruned marker \emph{M} matrix.}
 \item{\code{map}: a data frame containing the pruned map.}
}
}
\description{
For a given molecular dataset \eqn{\boldsymbol{M}} (in the format 0, 1 and 2)
it produces a reduced molecular matrix by eliminating "redundant"
markers using pruning techniques. This function finds and drops some of the
SNPs in high linkage disequilibrium (LD).
}
\details{
Pruning is recommended as redundancies can affect
the quality of matrices used for downstream analyses.
The algorithm used is based on Pearson's correlation between markers
as a \emph{proxy} for LD. In the event of a pairwise correlation higher
than the selected threshold, one marker of the pair will be eliminated according to
the selected criterion: call rate or minor allele frequency.
In case of a tie, one marker will be dropped at random.

Filtering markers (\link{qc.filtering}) is of high relevance before pruning.
Poor quality markers (\emph{e.g.}, monomorphic markers) may prevent correlations from being
calculated and may affect eliminations.
}
\examples{
# Read and filter genotypic data.
M.clean <- qc.filtering(
 M = geno.pine655,
 maf = 0.05,
 marker.callrate = 0.20, ind.callrate = 0.20,
 Fis = 1, heterozygosity = 0.98,
 na.string = "-9",
 plots = FALSE)$M.clean

# Prune correlations > 0.9.
Mpr <- snp.pruning(
 M = M.clean, pruning.thr = 0.90,
 by.chrom = FALSE, window.n = 40, overlap.n = 10)
head(Mpr$map)
Mpr$Mpruned[1:5, 1:5]

}
