% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preprocess.R
\name{preprocess}
\alias{preprocess}
\title{(Internal function) Perform the pre-processing step of IPCAPS}
\usage{
preprocess(
  files,
  label.file,
  lab.col,
  rdata.infile,
  bed.infile,
  cate.list,
  result.dir,
  threshold,
  min.fst,
  max.thread = NA,
  reanalysis = FALSE,
  method = "mix",
  min.in.group = 20,
  datatype = "snp",
  nonlinear = FALSE,
  missing.char = NA,
  regression.file = NA,
  regression.col.first = NA,
  regression.col.last = NA,
  reg.method = "linear",
  plot.as.pdf = NA,
  no.plot = NA
)
}
\arguments{
\item{files}{IPCAPS supports SNPs encoded as 0, 1 and 2 (dosage encoding).
Rows represent SNPs and columns represent subjects. Each column needs to be
separated by a space or a tab. A big text file should be divided into smaller
files to load faster. For instance, to input 3 files, use as: files=c(
'input1.txt', 'input2.txt', 'input3.txt').}

\item{label.file}{Additional useful information (called "labels" in
IPCAPS) related to the subjects, for example, geographic location or disease
phenotype. These labels (one at a time) are used in displaying the clustering
outcome of IPCAPS. A label file must contain at least one column. However,
it may contain more than one column, in which case the columns need to be
separated by a space or a tab.}

\item{lab.col}{The label in the label file to be used in the tree-like
display of IPCAPS clustering results.}

\item{rdata.infile}{In case of re-analysis, it is convenient to run IPCAPS
using the file rawdata.RData generated by IPCAPS. This file contains a matrix
of SNPs (raw.data) and a vector of labels (label).}

\item{bed.infile}{The PLINK binary format consists of 3 files: bed, bim, and
fam. To generate these files from PLINK, use the option \code{--make-bed}. See
more details at: \url{http://zzz.bwh.harvard.edu/plink/data.shtml}.}

\item{cate.list}{(Unimplemented) A list of categorical input files (text). For
instance, to input 3 files, use as: cate.list=c('input1.txt', 'input2.txt',
'input3.txt').}

\item{result.dir}{To set an absolute path for IPCAPS output. If the specified output
directory already exists, result files are saved in sub-directories
cluster_out, cluster_out1, cluster_out2, etc.}

\item{threshold}{Cutoff value for EigenFit.}

\item{min.fst}{Minimum Fst between a pair of subgroups.}

\item{max.thread}{(Requires the parallelization patch) Maximum number of
concurrent threads.}

\item{reanalysis}{(Unimplemented) To specify whether it is re-analysis or
not. If TRUE, it is re-analysis, otherwise it is not. Default = FALSE.}

\item{method}{The internal clustering method. It can be set to "mix"
(rubikclust & mixmod), "mixmod" (Lebret et al., 2015), "clara" (R: Clustering
Large Applications), "pam" (R: Partitioning Around Medoids (PAM) Object),
"meanshift" (Wang, 2016), "apcluster" (Bodenhofer et al., 2016), and "hclust"
(R: Hierarchical Clustering). Default = "mix".}

\item{min.in.group}{Minimum number of individuals to constitute a cluster or
subgroup.}

\item{datatype}{To specify whether the input data are 'snp' or another type.
Default = 'snp'.}

\item{nonlinear}{(Unimplemented) To specify whether linear or non-linear
method is used for IPCAPS analysis. If TRUE, non-linear method is used,
otherwise linear method is used. Default = FALSE.}

\item{missing.char}{Symbol used for missing genotypes. Default = NA.}

\item{regression.file}{A file of covariates; one covariate per column. SNPs
can be adjusted for these covariates via regression modeling and residual
computation.}

\item{regression.col.first}{Refers to the covariate file: the first covariate
to be considered as a confounding variable.}

\item{regression.col.last}{Refers to the covariate file: the last covariate to
be considered as a confounding variable. All the variables between
regression.col.first and regression.col.last will be considered in the
adjustment process.}

\item{reg.method}{(Fixed) Specify a method for regression analysis.
Default = 'linear'.}

\item{plot.as.pdf}{To export plots as PDF. When omitted, plots are saved as
PNG.}

\item{no.plot}{No plot is generated if this option is TRUE. This option is
useful when running on a Unix-based system that does not support X Windows.
Default = FALSE.}
}
\value{
A data frame of input data.
}
\description{
(Internal function) Perform the pre-processing step of IPCAPS
}
