% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/FormatCV.R
\name{FormatCV}
\alias{FormatCV}
\title{Format multiple trials with or without overlapping genotypes into
  training and test sets according to user-provided cross validation scheme}
\usage{
FormatCV(
  trial1,
  trial2,
  trial3 = NULL,
  cv.scheme,
  seed = NULL,
  remove.genotype = FALSE
)
}
\arguments{
\item{trial1}{\code{data.frame} object that is for use only when
\code{cv.scheme} is provided. Contains the trial to be tested in subsequent
model training functions. The first column contains unique identifiers,
second contains genotypes, third contains reference values, followed by
spectral columns. Include no other columns to the right of the spectra! Column
names of spectra must start with "X", reference column must be named
"reference", and genotype column must be named "genotype".}

\item{trial2}{\code{data.frame} object that is for use only when
\code{cv.scheme} is provided. This data.frame contains a trial that has
overlapping genotypes with \code{trial1} but that were grown in a different
site/year (different environment). Formatting must be consistent with
\code{trial1}.}

\item{trial3}{\code{data.frame} object that is for use only when
\code{cv.scheme} is provided. This data.frame contains a trial that may or
may not contain genotypes that overlap with \code{trial1}. Formatting must
be consistent with \code{trial1}. Optional; default is \code{NULL}.}

\item{cv.scheme}{A cross validation (CV) scheme from Jarquín et al., 2017.
Options for cv.scheme include:
\itemize{
    \item "CV1": untested lines in tested environments
    \item "CV2": tested lines in tested environments
    \item "CV0": tested lines in untested environments
    \item "CV00": untested lines in untested environments
}}

\item{seed}{Number used in the function \code{set.seed()} for reproducible
randomization. If \code{NULL}, no seed is set. Default is \code{NULL}.}

\item{remove.genotype}{boolean that, if \code{TRUE}, removes the "genotype"
column from the output \code{data.frame}. Default is \code{FALSE}.}
}
\value{
List of data.frames (training set, test set) compiled according to
  user-provided cross validation scheme.
}
\description{
Standalone function that is also used within
  \code{\link{TrainSpectralModel}} to divide trials or studies into training and test
  sets based on overlap in trial environments and genotype entries.
}
\details{
Use of a cross-validation scheme requires a column in the input
  \code{data.frame} named "genotype" to ensure proper sorting of training and
  test sets. Variables \code{trial1} and \code{trial2} are required, while
  \code{trial3} is optional.
}
\examples{
# Must have a column called "genotype", so we'll create a fake one for now
# We will use CV00, which does not require any overlap in genotypes
# In real scenarios, CV schemes that rely on genotypes should not be applied when
# genotypes are unknown, as in this case.
library(magrittr)
trials <- ikeogu.2017 \%>\%
    dplyr::mutate(genotype = 1:nrow(ikeogu.2017)) \%>\% # fake for this example
    dplyr::rename(reference = DMC.oven) \%>\%
    dplyr::select(study.name, sample.id, genotype, reference,
                  dplyr::starts_with("X"))
trial1 <- trials \%>\%
  dplyr::filter(study.name == "C16Mcal") \%>\%
  dplyr::select(-study.name)
trial2 <- trials \%>\%
  dplyr::filter(study.name == "C16Mval") \%>\%
  dplyr::select(-study.name)
cv.list <- FormatCV(trial1 = trial1, trial2 = trial2, cv.scheme = "CV00",
                    remove.genotype = TRUE)
cv.list[[1]][1:5, 1:5]
}
\references{
Jarquín, D., C. Lemes da Silva, R. C. Gaynor, J. Poland, A.
  Fritz, R. Howard, S. Battenfield, and J. Crossa. 2017. Increasing
  genomic-enabled prediction accuracy by modeling genotype × environment
  interactions in Kansas wheat. Plant Genome 10(2):1-15.
  <doi:10.3835/plantgenome2016.12.0130>
}
\author{
Jenna Hershberger \email{jmh579@cornell.edu}
}
